##########################################################
##########################################################
### Replication Materials for
### Stefan Müller, Garrett Kennedy, and Tomas Maher:
### Reactions to Experts in Deliberative Democracy: The 2016-2018 Irish Citizens' Assembly
### Irish Political Studies
### 
### Please get in touch with the authors if you have any questions: 
### stefan.mueller@ucd.ie
### You find detailed instructions on the replication materials 
### in the file 0000_readme.pdf
##########################################################
##########################################################


## Reproduce main analysis

# Load packages ----

library(tidyverse)          # CRAN v1.3.2
library(forcats)            # CRAN v1.0.0
library(lubridate)          # CRAN v1.9.1
library(quanteda)           # CRAN v3.3.0
library(quanteda.textstats) # CRAN v0.96
library(xtable)             # CRAN v1.8-4
library(cowplot)            # CRAN v1.1.1
library(stringr)            # CRAN v1.5.0
library(ggeffects)          # CRAN v1.1.5
library(stm)                # CRAN v1.3.6
library(texreg)             # CRAN v1.38.6
library(broom)              # CRAN v1.0.3
library(furrr)              # CRAN v0.3.1

# set working directory here or use the here package

# load custom ggplot2 scheme

theme_baser <- function (){
    theme_minimal()  %+replace%
        theme(panel.grid.minor.x = element_blank(),
              panel.grid.minor.y = element_blank(),
              panel.grid.major.x = element_blank(),
              panel.grid.major.y = element_blank(),
              panel.border = element_rect(fill = NA,colour = "black", linewidth = 0.5,
                                          linetype = "solid"),
              legend.title = element_text(size = 15),
              plot.caption = element_text(colour = "grey30", size = 11, hjust = 1),
              plot.title = element_text(size = 15, hjust = 0.5, face = "bold",
                                        margin = margin(b = 5, r = 5, l = 5, t = 5)),
              legend.position = "bottom",
              axis.ticks.y = element_line(size = 0.3),
              axis.ticks.x = element_line(size = 0.3),
              axis.ticks.length = unit(0.2, "cm"),
              legend.text=element_text(size = 13),
              panel.background = element_rect(fill='white'), #transparent panel bg
              plot.background = element_rect(fill='white', color= "white"), #transparent plot bg
              strip.text = element_text(size = 15, hjust = 0.5, face = "bold",
                                        margin = margin(b = 5, r = 5, l = 5, t = 5)),
              axis.text.y = element_text(colour = "black", size = 13, hjust = 1),
              axis.text.x = element_text(colour = "black", size = 13),
              axis.title = element_text(size = 13, hjust = 0.5))
}



# set theme
theme_set(theme_baser())


# function for predicted probability plots
plot_continuous <- function(x, 
                            xlab = "Topic Prevalence of Input",
                            ylab = "Predicted Maximum Topic Prevalence\nAcross Subsequent Q&A Sessions") {
    
    data <- x
    
    data$group_factor <- factor(data$group, levels = c("Expert Input", "Other Agenda Item"))
    
    ggplot(data = data, aes(x = x, y = predicted,
                            ymin = conf.low, ymax = conf.high)) +
        geom_ribbon(fill = "grey80") +
        geom_line(linewidth = 0.8) +
        facet_wrap(~group_factor) +
        scale_y_continuous(breaks = c(seq(0, 7, 0.1))) +
        scale_x_continuous(breaks = c(seq(0, 1, 0.2))) +
        labs(x = xlab,
             y = ylab)
    
}




plot_discrete <- function(x, 
                          xlab = "Topic Prevalence of Input",
                          ylab = "Predicted Maximum Topic Prevalence\nAcross Subsequent Q&A Sessions") {
    
    data <- x
    
    data$group_factor <- factor(data$group, levels = c("Expert Input", "Other Agenda Item"))
    
    ggplot(data = data, aes(x = x, y = predicted,
                            ymin = conf.low, 
                            ymax = conf.high)) +
        geom_point(size =3) +
        geom_linerange() +
        facet_wrap(~group_factor) +
        scale_y_continuous(breaks = c(seq(0, 7, 0.1))) +
        labs(x = xlab,
             y = ylab)
    
}

plot_bar <-  function(x, xlab = "Estimated Topic Proportion") {
    
    data <- x
    
    ggplot(data, 
           aes(x = mean, 
               y = reorder(topic_name, mean))) +
        geom_bar(stat = "identity") +
        geom_text(aes(label = paste0(round(mean * 100, 1), "%")), 
                  hjust = 1, nudge_x = -0.01, size = 3,
                  colour = "white") +
        facet_wrap(~topic_main) +
        geom_text(aes(label = words_frex6), 
                  hjust = 0, nudge_x = 0.01,
                  colour = "grey50") +
        scale_x_continuous(limits = c(0, 0.6),
                           labels = scales::percent_format(accuracy = 1),
                           breaks = c(seq(0, 0.6, 0.1))) +
        labs(x = xlab,
             y = NULL)
    
}


# Figure 01 ----

dat <- read.csv("coding_citizens_assembly.csv",
                fileEncoding = "UTF-8")

table(dat$type)

dat_recoded <- dat %>% 
    mutate(type_clean = case_when(
        str_detect(type, "onclu") ~ "Concluding Remarks",
        str_detect(type, "losion") ~ "Concluding Remarks",
        str_detect(type, "losing") ~ "Concluding Remarks",
        str_detect(type, "Feedback") ~ "Feedback",
        str_detect(type, "Information") ~ "Expert Testimonial",
        str_detect(type, "ntroduc") ~ "Welcome/Introduction",
        str_detect(type, "pening") ~ "Welcome/Introduction",
        str_detect(type, "Openning") ~ "Welcome/Introduction",
        str_detect(type, "eflection") ~ "Roundtable Discussion",
        str_detect(type, "Roundtable") ~ "Roundtable Discussion",
        str_detect(type, "Exercise") ~ "Roundtable Discussion",
        str_detect(type, "Q&A") ~ "Q&A Session",
        str_detect(type, "proceeding") ~ "Voting Proceedings",
        str_detect(type, "elcome") ~ "Welcome/Introduction",
        TRUE ~ "Other"
    )) %>% 
    filter(type != "Whole session") %>% 
    filter(type_clean != "Other") %>% 
    mutate(agenda_item_number = 1:n()) %>% 
    mutate(class = ifelse(type_clean == "Expert Testimonial", "Experts",
                          ifelse(type_clean == "Q&A Session", "Q&A",
                                 ifelse(type_clean == "Roundtable Discussion", 
                                        "Roundtable", "Other"))))

table(dat_recoded$type_clean,useNA = "always")

dat_recoded %>% 
    filter(is.na(type_clean)) %>% 
    select(type) %>% 
    unique()

# recode access based on URL

dat_recoded$class <- factor(dat_recoded$class,
                            levels = c("Experts", "Roundtable",
                                       "Q&A", "Other"))

dat_recoded <- dat_recoded %>% 
    mutate(access_recoded = ifelse(is.na(access) & !is.na(url_youtube), "public",
                                   ifelse(access == "" & !is.na(url_youtube), "public",
                                          ifelse(access == "--", "private", access)))) %>% 
    mutate(access_recoded = ifelse(is.na(access), "private", access_recoded)) %>% 
    mutate(access_recoded = str_to_title(access_recoded))

ggplot(dat_recoded, 
       aes(x = agenda_item_number, y = type_clean, colour = access_recoded)) +
    geom_point(shape = "|", size = 6, alpha = 0.8) +
    facet_grid(class~., scales = "free", space = "free") +
    labs(x = "Agenda Item", y = NULL) +
    scale_colour_manual(values = c("grey70", "black"), name = "Access") +
    scale_x_continuous(breaks = c(seq(0, 300, 50))) +
    theme(strip.text.y = element_text(angle = 0, hjust = 0),
          legend.position = "bottom", legend.title = element_text(vjust = 0.5))
ggsave("fig_01.pdf", width = 9, height = 5)
ggsave("fig_01.eps", width = 9, height = 5)



# Table 01 ----

# # save as table

dat_table <- dat_recoded %>% 
    fill(date) %>% 
    filter(date %in% c("2016-11-26")) %>% 
    filter(type != "Welcome") %>% 
    select(Date = date,
           Type = type_clean,
           url_youtube,
           Access = access_recoded,
           Speaker = speaker)

dat_text <- readRDS("data_deliberation_ireland.rds")

dat_text <- dat_text %>% 
    ungroup() %>% 
    select(text, url_youtube)


dat_text$ntoken <- ntoken(corpus(dat_text), remove_punct = TRUE) 

dat_text <- filter(dat_text, !is.na(url_youtube))

dat_merged_table <- left_join(dat_table, dat_text) %>% 
    select(-c(text, url_youtube)) 

dat_merged_table_clean <- dat_merged_table %>% 
    mutate(Speaker = ifelse(is.na(Speaker), "--", Speaker)) %>% 
    mutate(ntoken = format(ntoken, big.mark = ",")) %>% 
    mutate(ntoken = str_squish(ntoken)) |> 
    mutate(ntoken = ifelse(ntoken == "NA", "--", ntoken)) %>% 
    rename(`Length (words)` = ntoken) %>% 
    mutate(Item = 1:n()) %>% 
    select(Item, everything()) %>% 
    select(-Date) %>% 
    filter(Item != 11)

# print data frame for inspection
dat_merged_table_clean

print(xtable(dat_merged_table_clean, 
             type = "latex",
             label="tab:structure_day",
             caption = "Structure of a day during the Irish Citizens' Assembly (26 November 2016)",
             digits = 0), 
      include.rownames = FALSE,
      caption.placement = "top",
      format.args=list(big.mark = ","),
      size = "footnotesize",
      file = "tab_01.tex")


print(xtable(dat_merged_table_clean,
             digits = 0),
      type = "html",
      include.rownames = FALSE,
      caption.placement = "top",
      format.args=list(big.mark = ","),
      size = "footnotesize",
      file = "tab_01.htm")



# Prepare Data ----

# Import data
dat_raw <- readRDS("data_deliberation_ireland.rds")

# Remove NA's
dat_raw_text <- subset(dat_raw, !is.na(text))

# Import reference dataset

dat_ref <- readRDS("topics_ref.rds") 

# Merge both datasets

dat_ref <- dat_ref %>% 
    select(-c("date", "assembly"))

names(dat_ref)[c(2, 3)] <- c("order", "group")

# Remove duplicate value

dat_coded <- left_join(dat_raw_text, dat_ref, by = "title_youtube") %>% 
    filter(docname != "Citizens' Assembly 2017-07-09 1") 

dat_coded$date[dat_coded$order == 232] <- "2017-07-09"

# Create expert variable

dat_coded <- dat_coded %>%
    mutate(type = replace_na(type, "Information Session")) %>% 
    mutate(session_type = ifelse(type == "Q&A" | substr(type, 1, 8) == "Feedback", "Q&A",
                                 ifelse(substr(position_short, 1, 6) == "Expert", "Expert", "Other"))) %>% 
    mutate(session_type = replace_na(session_type, "Other"))

# cleaning the text
dat_coded$text <- str_to_lower(dat_coded$text)
dat_coded$text <- str_replace_all(dat_coded$text, 
                                  c("roach" = "roche", 
                                    "bala" = "ballot",
                                    "tea shock" = "taoiseach",
                                    "kylie" = "kindly", 
                                    "ballast" = "ballots", 
                                    "arrakis" = "oireachtas", 
                                    "approche" = "approach", 
                                    "laphroaig" = "laffoy", 
                                    "coffee" = "", 
                                    "sheehan" = "",
                                    "depressiaon" = "depression",
                                    "volts" = "votes", 
                                    "yeah" = "", 
                                    "table" = "", 
                                    " one " = " ", 
                                    "question" = "",
                                    "questions" = "",
                                    " yes " = " ",
                                    "commissiaon" = "commission",
                                    " no " = " ", 
                                    "\\-" = "", 
                                    "\\\\" = "", 
                                    "ssia" = "ssi",
                                    "depressiave" = "depressive", 
                                    "matter and" = "met eireann", 
                                    "my turn" = "met eireann", 
                                    "met her and her" = "met eireann", 
                                    "my sharon\\'s" = "met eireann", 
                                    "matheran" = "met eireann", 
                                    "metron" = "met eireann",
                                    "windham" = "winter",
                                    "huracan" = "hurricane", 
                                    "greenness got" = "greenhouse gases but", 
                                    "clutter" = "climate", 
                                    "metaphase" = "met office", 
                                    "float" = "flood", 
                                    "weatr" = "weather",
                                    "submissiaons" = "submissions",
                                    " s " = " ",
                                    "sessiaon" = "session"
                                  ))


# Create Corpus

corp <- corpus(dat_coded, docid_field = "docname")

# Get length of sessions

docvars(corp, "session_length") <- ntoken(corp)


corp_relevant <- corpus_subset(corp, group %in% c("8th Amendment",
                                                  "Referenda",
                                                  "Ageing Population",
                                                  "Climate Change"))




# Table 02 ----

# get summary of text corpus
tstat_sum <- textstat_summary(corp_relevant) %>% 
    bind_cols(docvars(corp_relevant)) %>% 
    mutate(qa_dummy = ifelse(session_type == "Q&A", 1,0),
           other_dummy = ifelse(session_type == "Other", 1,0),
           expert_dummy = ifelse(session_type == "Expert", 1, 0))


tstat_sum_table <- tstat_sum %>% 
    group_by(group) %>% 
    summarise(# Documents = n(),
        `Words (mean)` = round(mean(tokens), 0),
        `Words (min)` = round(min(tokens), 0),
        `Words (max)` = round(max(tokens), 0),
        `Expert Inputs` = sum(expert_dummy),
        `Q&A` = sum(qa_dummy),
        `Other` = sum(other_dummy)) %>% 
    rename(Topic = group) %>% 
    mutate(Topic = dplyr::recode(Topic, "8th Amendment" = "Abortion"))

sum(tstat_sum_table$`Expert Inputs`)
sum(tstat_sum_table$`Q&A`)
sum(tstat_sum_table$`Other`)

print(xtable(tstat_sum_table, 
             type = "latex",
             label="tab:summary_all",
             caption = "Overview of documents in text corpus",
             digits = 0), 
      include.rownames = FALSE,
      caption.placement = "top",
      format.args=list(big.mark = ","), 
      size = "footnotesize",
      file = "tab_02.tex")

print(xtable(tstat_sum_table, 
             digits = 0), 
      type = "html",
      include.rownames = FALSE,
      caption.placement = "top",
      format.args=list(big.mark = ","), 
      file = "tab_02.htm")


# Filter corpora into subsets

# Filter to 8th Amendment
corp_eighth <- corpus_subset(corp, group == "8th Amendment")

# Filter to Referenda
corp_referenda <- corpus_subset(corp, group == "Referenda")

# Filter to Ageing Population
corp_ageing <- corpus_subset(corp, group == "Ageing Population")

# Filter to Climate
corp_climate <- corpus_subset(corp, group == "Climate Change")


# Create DFMs

# Eighth 

tok_eighth <- tokens(corp_eighth)

dfm_eighth_raw <- dfm(tok_eighth)

dfm_eighth_raw <- dfm_remove(dfm_eighth_raw, pattern = stopwords("en"))

# Referenda

tok_referenda <- tokens(corp_referenda)

dfm_referenda_raw <- dfm(tok_referenda)

dfm_referenda_raw <- dfm_remove(dfm_referenda_raw, pattern = stopwords("en"))

# Ageing

tok_ageing <- tokens(corp_ageing)

dfm_ageing_raw <- dfm(tok_ageing)

dfm_ageing_raw <- dfm_remove(dfm_ageing_raw, pattern = stopwords("en"))

# Climate

tok_climate <- tokens(corp_climate)

dfm_climate_raw <- dfm(tok_climate)

dfm_climate_raw <- dfm_remove(dfm_climate_raw, pattern = stopwords("en"))



# # Topic Models ----

# Load library, set seed and trim data

set.seed(123)

# trim the documents to remove unnecessary tokens (not necessary for referenda topic)

# Eighth
dfm_eighth_topics <- dfm_trim(dfm_eighth_raw, 
                              min_termfreq = 0.8, termfreq_type = "quantile", 
                              max_docfreq = 0.1, docfreq_type = "prop") %>% 
    dfm_keep(min_nchar = 2)

# Ageing

dfm_ageing_topics <- dfm_trim(dfm_ageing_raw, 
                              min_termfreq = 0.1, termfreq_type = "quantile", 
                              max_docfreq = 0.6, docfreq_type = "prop") %>% 
    dfm_keep(min_nchar = 2)

# Climate

dfm_climate_topics <- dfm_trim(dfm_climate_raw, 
                               min_termfreq = 0.9, termfreq_type = "quantile", 
                               max_docfreq = 0.5, docfreq_type = "prop") %>% 
    dfm_keep(min_nchar = 2)


# Convert to stm 

# filter out "Citizens' Assembly 2017-01-07 12" because it was dropped when making the STM
dfm_eighth_temp <- dfm_subset(dfm_eighth_topics, docname_ != "Citizens' Assembly 2017-01-07 12")

# convert the dfm to a format used for stm
# Eighth

dfm_eighth_topics_stm <- quanteda::convert(dfm_eighth_temp, to = "stm")

# Referenda

dfm_referenda_topics <- dfm_referenda_raw %>% 
    dfm_keep(min_nchar = 2)

# convert the dfm to a format used for stm
dfm_referenda_topics_stm <- quanteda::convert(dfm_referenda_topics, to = "stm")

# Ageing

dfm_ageing_topics_stm <- quanteda::convert(dfm_ageing_topics, to = "stm")

# Climate

dfm_climate_topics_stm <- quanteda::convert(dfm_climate_topics, to='stm')


# Create stm's

# Eighth 
# input selected number of topics here:
K <- 10

# filter out documents that were dropped when making the STM
dfm_eighth_topics_stm <- dfm_subset(dfm_eighth_topics, 
                                    docname_ != "Citizens' Assembly 2017-01-07 12")

dfm_eighth_topics_stm_obj <- convert(dfm_eighth_topics_stm, 
                                     to = "stm")

# generate topics
set.seed(123, kind = "Mersenne-Twister", normal.kind = "Inversion")

stm_eighth <- stm(dfm_eighth_topics_stm_obj$documents,
                  dfm_eighth_topics_stm_obj$vocab,
                  K = K, 
                  verbose = FALSE, max.em.its = 50)


labs_eighth <- labelTopics(stm_eighth, n = 10)


dat_labels_eighth <- data.frame(
    frex = labs_eighth$frex)

dat_labels_eighth$topic <- factor(rownames(dat_labels_eighth))

dat_labels_eighth <- dat_labels_eighth %>% 
    mutate(words_frex4 = paste(frex.1, frex.2, frex.3, frex.4,
                               sep = ", ")) %>% 
    mutate(words_frex6 = paste(frex.1, frex.2, frex.3, frex.4, frex.5, frex.6,
                               sep = ", ")) %>%
    mutate(words_frex = paste(frex.1, frex.2, frex.3, frex.4, frex.5, frex.6, frex.7, frex.8, frex.9, frex.10, 
                              sep = ", ")) %>% 
    mutate(main_topic = "8th Amendment")


dat_labels_eighth <- dat_labels_eighth %>%
    mutate(topic_name = case_when(
        str_detect(words_frex, "publication") ~ "General Proceedings",
        str_detect(words_frex, "ultrasound") ~ "Ultrasounds",
        str_detect(words_frex, "depressed") ~ "Assaults and Mental Health",
        str_detect(words_frex, "offense") ~ "Criminality",
        str_detect(words_frex, "wales") ~ "Travel",
        str_detect(words_frex, "replacement") ~ "Voting/Results",
        str_detect(words_frex, "b1") ~ "Trimesters",
        str_detect(words_frex, "charter") ~ "Human Rights",
        str_detect(words_frex, "president") ~ "Policy",
        str_detect(words_frex, "roche") ~ "Fertilisation"
    ))

table(dat_labels_eighth$topic_name)


dat_prop_eight <- stm_eighth %>%
    make.dt() %>% 
    gather(topic, proportion, -docnum) %>% 
    mutate(topic = str_remove_all(topic, "Topic")) %>% 
    group_by(topic) %>% 
    summarise(mean = mean(proportion, na.rm = TRUE),
              sd = sd(proportion, na.rm = TRUE),
              n = n())  %>% 
    mutate(topic_main = "Abortion (Eighth Amendment)")


dat_sum_eight <- left_join(dat_labels_eighth,
                           dat_prop_eight)



# Referenda
# input selected number of topics here:
K <- 7

dfm_referenda_topics_stm_obj <- dfm_referenda_topics_stm

# generate topics

set.seed(123, kind = "Mersenne-Twister", normal.kind = "Inversion")

stm_referenda <- stm(dfm_referenda_topics_stm_obj$documents,
                     dfm_referenda_topics_stm_obj$vocab,
                     K = K,
                     verbose = FALSE, max.em.its = 50)


labs_referenda <- labelTopics(stm_referenda, n = 10)

dat_labels_referenda <- data.frame(
    frex = labs_referenda$frex)

dat_labels_referenda$topic <- factor(rownames(dat_labels_referenda))

dat_labels_referenda <- dat_labels_referenda %>% 
    mutate(words_frex4 = paste(frex.1, frex.2, frex.3, frex.4, 
                               sep = ", ")) %>% 
    mutate(words_frex6 = paste(frex.1, frex.2, frex.3, frex.4, frex.5, frex.6, 
                               sep = ", ")) %>% 
    mutate(words_frex = paste(frex.1, frex.2, frex.3, frex.4, frex.5, frex.6, frex.7, frex.8, frex.9, frex.10, 
                              sep = ", "))

dat_labels_referenda <- dat_labels_referenda %>%
    mutate(topic_name = case_when(
        str_detect(words_frex, "percent") ~ "Results",
        str_detect(words_frex, "anonymous") ~ "Voting",
        str_detect(words_frex, "constitution") ~ "Constitution",
        str_detect(words_frex, "submissions") ~ "Ballots",
        str_detect(words_frex, "turnout") ~ "Election Turnout",
        str_detect(words_frex, "democracy") ~ "Citizens' Initiatives",
        str_detect(words_frex, "broadcasters") ~ "Media Balance"
    ))  %>% 
    mutate(main_topic = "Referenda")


dat_prop_referenda <- stm_referenda %>%
    make.dt() %>% 
    gather(topic, proportion, -docnum) %>% 
    mutate(topic = str_remove_all(topic, "Topic")) %>% 
    group_by(topic) %>% 
    summarise(mean = mean(proportion, na.rm = TRUE),
              sd = sd(proportion, na.rm = TRUE),
              n = n()) %>% 
    mutate(topic_main = "Referenda")


dat_sum_referenda <- left_join(dat_labels_referenda,
                               dat_prop_referenda)


# Ageing
# input selected number of topics here:
K <- 8

dfm_ageing_topics_stm_obj <- dfm_ageing_topics_stm

# generate topics

set.seed(123, kind = "Mersenne-Twister", normal.kind = "Inversion")

stm_ageing <- stm(dfm_ageing_topics_stm_obj$documents,
                  dfm_ageing_topics_stm_obj$vocab,
                  K = K, 
                  verbose = FALSE, max.em.its = 50)


labs_ageing <- labelTopics(stm_ageing, n = 10)

dat_labels_ageing <- data.frame(
    frex = labs_ageing$frex)

dat_labels_ageing$topic <- factor(rownames(dat_labels_ageing))

dat_labels_ageing <- dat_labels_ageing %>% 
    mutate(words_frex4 = paste(frex.1, frex.2, frex.3, frex.4, 
                               sep = ", ")) %>% 
    mutate(words_frex6 = paste(frex.1, frex.2, frex.3, frex.4, frex.5, frex.6,
                               sep = ", ")) %>% 
    mutate(words_frex = paste(frex.1, frex.2, frex.3, frex.4, frex.5, frex.6, frex.7, frex.8, frex.9, frex.10, 
                              sep = ", "))

dat_labels_ageing <- dat_labels_ageing %>%
    mutate(topic_name = case_when(
        str_detect(words_frex, "recommendation") ~ "Assembly Procedures",
        str_detect(words_frex, "gdp") ~ "Economy",
        str_detect(words_frex, "housing") ~ "Housing",
        str_detect(words_frex, "votes") ~ "Voting",
        str_detect(words_frex, "firms") ~ "Pensions: Contributions",
        str_detect(words_frex, "census") ~ "Data and Demographics",
        str_detect(words_frex, "poverty") ~ "Pensions: Poverty",
        str_detect(words_frex, "activities") ~ "Quality of Life"
    )) %>% 
    mutate(main_topic = "Ageing Population")


dat_prop_ageing <- stm_ageing %>%
    make.dt() %>% 
    gather(topic, proportion, -docnum) %>% 
    mutate(topic = str_remove_all(topic, "Topic")) %>% 
    group_by(topic) %>% 
    summarise(mean = mean(proportion, na.rm = TRUE),
              sd = sd(proportion, na.rm = TRUE),
              n = n()) %>% 
    mutate(topic_main = "Ageing Population")


dat_sum_ageing <- left_join(dat_labels_ageing,
                            dat_prop_ageing)


# Climate
# input selected number of topics here:
K <- 9


# filter out documents that were dropped when making the STM
dfm_climate_topics_stm <- dfm_subset(dfm_climate_topics, 
                                     !(docname_ %in%  c("Citizens' Assembly 2017-10-01 15", "Citizens' Assembly 2017-11-04 6", "Citizens' Assembly 2017-11-05 1")))


dfm_climate_topics_stm_obj <- convert(dfm_climate_topics_stm, 
                                      to = "stm")

# generate topics
stm_climate <- stm(dfm_climate_topics_stm_obj$documents,
                   dfm_climate_topics_stm_obj$vocab,
                   K = K, seed = 123, 
                   verbose = FALSE, max.em.its = 50)


labs_climate <- labelTopics(stm_climate, n = 10)

dat_labels_climate <- data.frame(
    frex = labs_climate$frex)

dat_labels_climate$topic <- factor(rownames(dat_labels_climate))

dat_labels_climate <- dat_labels_climate %>% 
    mutate(words_frex4 = paste(frex.1, frex.2, frex.3, frex.4, 
                               sep = ", ")) %>%
    mutate(words_frex6 = paste(frex.1, frex.2, frex.3, frex.4, frex.5, frex.6,
                               sep = ", ")) %>% 
    mutate(words_frex = paste(frex.1, frex.2, frex.3, frex.4, frex.5, frex.6, frex.7, frex.8, frex.9, frex.10, 
                              sep = ", ")) %>% 
    mutate(main_topic = "Climate Change")

dat_labels_climate <- dat_labels_climate %>%
    mutate(topic_name = case_when(
        str_detect(words_frex, "ballot") ~ "Ballots: Results",
        str_detect(words_frex, "resources") ~ "Resources",
        str_detect(words_frex, "draft") ~ "Legislation",
        str_detect(words_frex, "heating") ~ "Energy",
        str_detect(words_frex, "bus") ~ "Transport",
        str_detect(words_frex, "suggestion") ~ "Assembly Contributions",
        str_detect(words_frex, "communities") ~ "Community Engagement",
        str_detect(words_frex, "temperature") ~ "Weather",
        str_detect(words_frex, "farm") ~ "Agriculture"
    ))

dat_prop_climate <- stm_climate %>%
    make.dt() %>% 
    gather(topic, proportion, -docnum) %>% 
    mutate(topic = str_remove_all(topic, "Topic")) %>% 
    group_by(topic) %>% 
    summarise(mean = mean(proportion, na.rm = TRUE),
              sd = sd(proportion, na.rm = TRUE),
              n = n()) %>% 
    mutate(topic_main = "Climate Change")

dat_sum_climate <- left_join(dat_labels_climate,
                             dat_prop_climate)




# Figure 02 ----

dat_sum_combined <- bind_rows(
    dat_sum_ageing,
    dat_sum_eight,
    dat_sum_climate,
    dat_sum_referenda
)


p_ageing <- plot_bar(dat_sum_ageing)
p_ageing


p_climate <- plot_bar(dat_sum_climate)
p_climate

p_referenda <- plot_bar(dat_sum_referenda)
p_referenda


p_eight <- plot_bar(dat_sum_eight)
p_eight

cowplot::plot_grid(p_eight,
                   p_ageing,
                   p_climate,
                   p_referenda,
                   nrow = 4,
                   align = "v")
ggsave("fig_02.pdf",
       width = 9.3, height = 12)
ggsave("fig_02.eps",
       width = 9.3, height = 12)


# Creating STMs and dataframes ----

# Change stm into a table for easier manipulation

# Eighth

stm_table_eighth <- make.dt(stm_eighth, docvars(dfm_eighth_topics_stm))

write.csv(stm_table_eighth, "stm_table_eighth.csv",
          fileEncoding = "UTF-8")

# Referenda

stm_table_referenda <- make.dt(stm_referenda, docvars(dfm_referenda_topics))

write.csv(stm_table_referenda, "stm_table_referenda.csv",
          fileEncoding = "UTF-8")

# Ageing

stm_table_ageing <- make.dt(stm_ageing, docvars(dfm_ageing_topics))

write.csv(stm_table_ageing, "stm_table_ageing.csv",
          fileEncoding = "UTF-8")

# Climate

stm_table_climate <- make.dt(stm_climate, docvars(dfm_climate_topics_stm))

write.csv(stm_table_climate, "stm_table_climate.csv",
          fileEncoding = "UTF-8")


# save the data with topic labels

dat_labels <- bind_rows(
    dat_labels_ageing,
    dat_labels_climate,
    dat_labels_eighth,
    dat_labels_referenda
)


write.csv(dat_labels, "data_labels.csv", fileEncoding = "UTF-8")


# *Calculating next 5 topics*

# Eighth 

# set threshold
t <-  0
K <-  10

topic_table <- stm_table_eighth

# This renames the topics so that it's easier to move the table to a tidier format later on
topic_table <- topic_table %>% 
    rename(
        topic_prop.topic_1 = Topic1,
        topic_prop.topic_2 = Topic2,
        topic_prop.topic_3 = Topic3,
        topic_prop.topic_4 = Topic4,
        topic_prop.topic_5 = Topic5,
        topic_prop.topic_6 = Topic6,
        topic_prop.topic_7 = Topic7,
        topic_prop.topic_8 = Topic8,
        topic_prop.topic_9 = Topic9,
        topic_prop.topic_10 = Topic10,
    )

# the code below cycles through every document (rows) and every Sub-Topic (Columns) to see if the document mentions a topic (if its above the threshold - currently set to 0) and if it does it then creates a new column for each of the next 5 documents giving the topic prevalence of those documents but only if they are Q&A sessions. 

# The code basically says: Create a new column with the column name indicating whether its the next video or the video after that etc. and the topic number <- If it's not the last video check if the expert's mentions the topic enough to push it over the threshold, if they do, check if the next video is a Q&A session, if it is paste in the value from that Q&A session, if not don't paste anything. Repeat for the next 4 agenda items.

for(i in 1:nrow(topic_table)){
    for(j in 1:K){
        # Next session
        topic_table[i, paste("next_agenda_item_1.topic_", j, sep = '')] <- ifelse(i != nrow(topic_table),
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+1] == "Q&A", as.character(topic_table[i+1, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")),NA)
        
        # Two sessions later
        topic_table[i, paste("next_agenda_item_2.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 1), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+2] == "Q&A", as.character(topic_table[i+2, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Three sessions later
        topic_table[i, paste("next_agenda_item_3.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 2), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+3] == "Q&A", as.character(topic_table[i+3, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Four sessions later
        topic_table[i, paste("next_agenda_item_4.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 3), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+4] == "Q&A", as.character(topic_table[i+4, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Five sessions later
        topic_table[i, paste("next_agenda_item_5.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 4), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+5] == "Q&A", as.character(topic_table[i+5, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
    }
}

topic_table_eighth <- topic_table

# Referenda

# set threshold
t <-  0
K <- 7

topic_table <- stm_table_referenda

topic_table <- topic_table %>% 
    rename(
        topic_prop.topic_1 = Topic1,
        topic_prop.topic_2 = Topic2,
        topic_prop.topic_3 = Topic3,
        topic_prop.topic_4 = Topic4,
        topic_prop.topic_5 = Topic5,
        topic_prop.topic_6 = Topic6,
        topic_prop.topic_7 = Topic7
    )

for(i in 1:nrow(topic_table)){
    for(j in 1:K){
        # Next session
        topic_table[i, paste("next_agenda_item_1.topic_", j, sep = '')] <- ifelse(i != nrow(topic_table),
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+1] == "Q&A", as.character(topic_table[i+1, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")),NA)
        
        # Two sessions later
        topic_table[i, paste("next_agenda_item_2.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 1), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+2] == "Q&A", as.character(topic_table[i+2, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Three sessions later
        topic_table[i, paste("next_agenda_item_3.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 2), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+3] == "Q&A", as.character(topic_table[i+3, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Four sessions later
        topic_table[i, paste("next_agenda_item_4.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 3), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+4] == "Q&A", as.character(topic_table[i+4, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Five sessions later
        topic_table[i, paste("next_agenda_item_5.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 4), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+5] == "Q&A", as.character(topic_table[i+5, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
    }
}

topic_table_referenda <- topic_table

# Ageing

# set threshold
t <-  0
K <-  8


topic_table <- stm_table_ageing

topic_table <- topic_table %>% 
    rename(
        topic_prop.topic_1 = Topic1,
        topic_prop.topic_2 = Topic2,
        topic_prop.topic_3 = Topic3,
        topic_prop.topic_4 = Topic4,
        topic_prop.topic_5 = Topic5,
        topic_prop.topic_6 = Topic6,
        topic_prop.topic_7 = Topic7,
        topic_prop.topic_8 = Topic8
    )

for(i in 1:nrow(topic_table)){
    for(j in 1:K){
        # Next session
        topic_table[i, paste("next_agenda_item_1.topic_", j, sep = '')] <- ifelse(i != nrow(topic_table),
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+1] == "Q&A", as.character(topic_table[i+1, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")),NA)
        
        # Two sessions later
        topic_table[i, paste("next_agenda_item_2.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 1), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+2] == "Q&A", as.character(topic_table[i+2, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Three sessions later
        topic_table[i, paste("next_agenda_item_3.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 2), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+3] == "Q&A", as.character(topic_table[i+3, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Four sessions later
        topic_table[i, paste("next_agenda_item_4.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 3), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+4] == "Q&A", as.character(topic_table[i+4, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Five sessions later
        topic_table[i, paste("next_agenda_item_5.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 4), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+5] == "Q&A", as.character(topic_table[i+5, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
    }
}

topic_table_ageing <- topic_table

# Climate

t <- 0
K <- 9


topic_table <- stm_table_climate

topic_table <- topic_table %>% 
    rename(
        topic_prop.topic_1 = Topic1,
        topic_prop.topic_2 = Topic2,
        topic_prop.topic_3 = Topic3,
        topic_prop.topic_4 = Topic4,
        topic_prop.topic_5 = Topic5,
        topic_prop.topic_6 = Topic6,
        topic_prop.topic_7 = Topic7,
        topic_prop.topic_8 = Topic8,
        topic_prop.topic_9 = Topic9,
    )


for(i in 1:nrow(topic_table)){
    for(j in 1:K){
        # Next session
        topic_table[i, paste("next_agenda_item_1.topic_", j, sep = '')] <- ifelse(i != nrow(topic_table),
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+1] == "Q&A", as.character(topic_table[i+1, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")),NA)
        
        # Two sessions later
        topic_table[i, paste("next_agenda_item_2.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 1), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+2] == "Q&A", as.character(topic_table[i+2, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Three sessions later
        topic_table[i, paste("next_agenda_item_3.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 2), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+3] == "Q&A", as.character(topic_table[i+3, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Four sessions later
        topic_table[i, paste("next_agenda_item_4.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 3), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+4] == "Q&A", as.character(topic_table[i+4, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
        # Five sessions later
        topic_table[i, paste("next_agenda_item_5.topic_", j, sep = '')] <- ifelse(i < (nrow(topic_table) - 4), 
                                                                                  (ifelse(topic_table[i, paste("topic_prop.topic_", (as.character(j)), sep = ''), with = FALSE] > t, 
                                                                                          (ifelse(topic_table$session_type[i+5] == "Q&A", as.character(topic_table[i+5, paste("topic_prop.topic_", j, sep = ''), with = FALSE]), "")), 
                                                                                          "TOPIC NOT MENTIONED")), NA)
        
    }
}

topic_table_climate <- topic_table



# Pivot the table into tidy format  


# Tidy the table so that each row represents a specific document and topic. 

# Eighth
tidy_table_eighth <- topic_table_eighth %>% 
    pivot_longer(names_to = c(".value", "topic"),
                 names_sep = "\\.",
                 names_repair = "unique",
                 cols = na.omit(str_extract(names(topic_table_eighth),
                                            "[:graph:]+topic[:graph:]+")))

# Referenda

tidy_table_referenda <- topic_table_referenda %>% 
    pivot_longer(names_to = c(".value", "topic"),
                 names_sep = "\\.",
                 names_repair = "unique",
                 cols = na.omit(str_extract(names(topic_table_referenda),
                                            "[:graph:]+topic[:graph:]+")))

# Ageing

tidy_table_ageing <- topic_table_ageing %>% 
    pivot_longer(names_to = c(".value", "topic"),
                 names_sep = "\\.",
                 names_repair = "unique",
                 cols = na.omit(str_extract(names(topic_table_ageing),
                                            "[:graph:]+topic[:graph:]+")))

# Climate

tidy_table_climate <- topic_table_climate %>% 
    pivot_longer(names_to = c(".value", "topic"),
                 names_sep = "\\.",
                 names_repair = "unique",
                 cols = na.omit(str_extract(names(topic_table_climate),
                                            "[:graph:]+topic[:graph:]+")))



# Clean the data & calculate the maximum value in Q&A sessions


# Create date lookup table
lookup_date <- dat_coded$date
names(lookup_date) <- dat_coded$order

# Eighth

tidy_table_eighth <- tidy_table_eighth %>% 
    filter(next_agenda_item_1 != "TOPIC NOT MENTIONED") %>% 
    filter(is.na(next_agenda_item_1) == FALSE) %>% 
    mutate(next_agenda_item_1 = as.numeric(next_agenda_item_1)) %>% 
    mutate(next_agenda_item_2 = as.numeric(next_agenda_item_2)) %>%
    mutate(next_agenda_item_3 = as.numeric(next_agenda_item_3)) %>%
    mutate(next_agenda_item_4 = as.numeric(next_agenda_item_4)) %>%
    mutate(next_agenda_item_5 = as.numeric(next_agenda_item_5)) %>% 
    mutate(next_agenda_item_1_days = unname(lookup_date[(as.character((order) + (1)))]) - date) %>% 
    mutate(next_agenda_item_2_days = unname(lookup_date[(as.character((order) + (2)))]) - date) %>%
    mutate(next_agenda_item_3_days = unname(lookup_date[(as.character((order) + (3)))]) - date) %>%
    mutate(next_agenda_item_4_days = unname(lookup_date[(as.character((order) + (4)))]) - date) %>%
    mutate(next_agenda_item_5_days = unname(lookup_date[(as.character((order) + (5)))]) - date) %>%
    mutate(next_agenda_item_1 = ifelse(next_agenda_item_1_days > 2, NA, next_agenda_item_1)) %>%
    mutate(next_agenda_item_2 = ifelse(next_agenda_item_2_days > 2, NA, next_agenda_item_2)) %>% 
    mutate(next_agenda_item_3 = ifelse(next_agenda_item_3_days > 2, NA, next_agenda_item_3)) %>% 
    mutate(next_agenda_item_4 = ifelse(next_agenda_item_4_days > 2, NA, next_agenda_item_4)) %>% 
    mutate(next_agenda_item_5 = ifelse(next_agenda_item_5_days > 2, NA, next_agenda_item_5)) %>% 
    filter(is.na(next_agenda_item_1) == FALSE | 
               is.na(next_agenda_item_2) == FALSE |
               is.na(next_agenda_item_3) == FALSE |
               is.na(next_agenda_item_4) == FALSE |
               is.na(next_agenda_item_5) == FALSE) %>% 
    rowwise() %>% 
    mutate(qa_topic_prop_max = max(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE))

# Referenda

tidy_table_referenda <- tidy_table_referenda %>% 
    filter(next_agenda_item_1 != "TOPIC NOT MENTIONED") %>% 
    filter(is.na(next_agenda_item_1) == FALSE) %>% 
    mutate(next_agenda_item_1 = as.numeric(next_agenda_item_1)) %>% 
    mutate(next_agenda_item_2 = as.numeric(next_agenda_item_2)) %>%
    mutate(next_agenda_item_3 = as.numeric(next_agenda_item_3)) %>%
    mutate(next_agenda_item_4 = as.numeric(next_agenda_item_4)) %>%
    mutate(next_agenda_item_5 = as.numeric(next_agenda_item_5)) %>% 
    mutate(next_agenda_item_1_days = unname(lookup_date[(as.character((order) + (1)))]) - date) %>% 
    mutate(next_agenda_item_2_days = unname(lookup_date[(as.character((order) + (2)))]) - date) %>%
    mutate(next_agenda_item_3_days = unname(lookup_date[(as.character((order) + (3)))]) - date) %>%
    mutate(next_agenda_item_4_days = unname(lookup_date[(as.character((order) + (4)))]) - date) %>%
    mutate(next_agenda_item_5_days = unname(lookup_date[(as.character((order) + (5)))]) - date) %>%
    mutate(next_agenda_item_1 = ifelse(next_agenda_item_1_days > 2, NA, next_agenda_item_1)) %>%
    mutate(next_agenda_item_2 = ifelse(next_agenda_item_2_days > 2, NA, next_agenda_item_2)) %>% 
    mutate(next_agenda_item_3 = ifelse(next_agenda_item_3_days > 2, NA, next_agenda_item_3)) %>% 
    mutate(next_agenda_item_4 = ifelse(next_agenda_item_4_days > 2, NA, NA)) %>% # removing days 4 & 5 because of agenda
    mutate(next_agenda_item_5 = ifelse(next_agenda_item_5_days > 2, NA, NA)) %>% 
    filter(is.na(next_agenda_item_1) == FALSE | 
               is.na(next_agenda_item_2) == FALSE |
               is.na(next_agenda_item_3) == FALSE |
               is.na(next_agenda_item_4) == FALSE |
               is.na(next_agenda_item_5) == FALSE) %>% 
    rowwise() %>% 
    mutate(qa_topic_prop_max = max(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE))

# Ageing

tidy_table_ageing <- tidy_table_ageing %>% 
    filter(next_agenda_item_1 != "TOPIC NOT MENTIONED") %>% 
    filter(is.na(next_agenda_item_1) == FALSE) %>% 
    mutate(next_agenda_item_1 = as.numeric(next_agenda_item_1)) %>% 
    mutate(next_agenda_item_2 = as.numeric(next_agenda_item_2)) %>%
    mutate(next_agenda_item_3 = as.numeric(next_agenda_item_3)) %>%
    mutate(next_agenda_item_4 = as.numeric(next_agenda_item_4)) %>%
    mutate(next_agenda_item_5 = as.numeric(next_agenda_item_5)) %>%
    mutate(next_agenda_item_1_days = unname(lookup_date[(as.character((order) + (1)))]) - date) %>% 
    mutate(next_agenda_item_2_days = unname(lookup_date[(as.character((order) + (2)))]) - date) %>%
    mutate(next_agenda_item_3_days = unname(lookup_date[(as.character((order) + (3)))]) - date) %>%
    mutate(next_agenda_item_4_days = unname(lookup_date[(as.character((order) + (4)))]) - date) %>%
    mutate(next_agenda_item_5_days = unname(lookup_date[(as.character((order) + (5)))]) - date) %>%
    mutate(next_agenda_item_1 = ifelse(next_agenda_item_1_days > 2, NA, next_agenda_item_1)) %>%
    mutate(next_agenda_item_2 = ifelse(next_agenda_item_2_days > 2, NA, next_agenda_item_2)) %>% 
    mutate(next_agenda_item_3 = ifelse(next_agenda_item_3_days > 2, NA, next_agenda_item_3)) %>% 
    mutate(next_agenda_item_4 = ifelse(next_agenda_item_4_days > 2, NA, next_agenda_item_4)) %>% 
    mutate(next_agenda_item_5 = ifelse(next_agenda_item_5_days > 2, NA, next_agenda_item_5)) %>% 
    filter(is.na(next_agenda_item_1) == FALSE | 
               is.na(next_agenda_item_2) == FALSE |
               is.na(next_agenda_item_3) == FALSE |
               is.na(next_agenda_item_4) == FALSE |
               is.na(next_agenda_item_5) == FALSE) %>% 
    rowwise() %>% 
    mutate(qa_topic_prop_max = max(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE))

# Climate

tidy_table_climate <- tidy_table_climate %>% 
    filter(next_agenda_item_1 != "TOPIC NOT MENTIONED") %>% 
    filter(is.na(next_agenda_item_1) == FALSE) %>% 
    mutate(next_agenda_item_1 = as.numeric(next_agenda_item_1)) %>% 
    mutate(next_agenda_item_2 = as.numeric(next_agenda_item_2)) %>%
    mutate(next_agenda_item_3 = as.numeric(next_agenda_item_3)) %>%
    mutate(next_agenda_item_4 = as.numeric(next_agenda_item_4)) %>%
    mutate(next_agenda_item_5 = as.numeric(next_agenda_item_5)) %>%
    mutate(next_agenda_item_1_days = unname(lookup_date[(as.character((order) + (1)))]) - date) %>% 
    mutate(next_agenda_item_2_days = unname(lookup_date[(as.character((order) + (2)))]) - date) %>%
    mutate(next_agenda_item_3_days = unname(lookup_date[(as.character((order) + (3)))]) - date) %>%
    mutate(next_agenda_item_4_days = unname(lookup_date[(as.character((order) + (4)))]) - date) %>%
    mutate(next_agenda_item_5_days = unname(lookup_date[(as.character((order) + (5)))]) - date) %>%
    mutate(next_agenda_item_1 = ifelse(next_agenda_item_1_days > 2, NA, next_agenda_item_1)) %>%
    mutate(next_agenda_item_2 = ifelse(next_agenda_item_2_days > 2, NA, next_agenda_item_2)) %>% 
    mutate(next_agenda_item_3 = ifelse(next_agenda_item_3_days > 2, NA, next_agenda_item_3)) %>% 
    mutate(next_agenda_item_4 = ifelse(next_agenda_item_4_days > 2, NA, next_agenda_item_4)) %>% 
    mutate(next_agenda_item_5 = ifelse(next_agenda_item_5_days > 2, NA, next_agenda_item_5)) %>% 
    filter(is.na(next_agenda_item_1) == FALSE | 
               is.na(next_agenda_item_2) == FALSE |
               is.na(next_agenda_item_3) == FALSE |
               is.na(next_agenda_item_4) == FALSE |
               is.na(next_agenda_item_5) == FALSE) %>% 
    rowwise() %>% 
    mutate(qa_topic_prop_max = max(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE))


# Calculating control variables for regression 

# Find the order num of the chosen Q&A Session

# Eighth
tidy_table_eighth <- tidy_table_eighth %>% 
    mutate(doc_num_qa = ifelse(next_agenda_item_1 == qa_topic_prop_max & !is.na(next_agenda_item_1 == qa_topic_prop_max), order + 1,
                               ifelse(next_agenda_item_2 == qa_topic_prop_max & !is.na(next_agenda_item_2 == qa_topic_prop_max), order + 2, 
                                      ifelse(next_agenda_item_3 == qa_topic_prop_max & !is.na(next_agenda_item_3 == qa_topic_prop_max), order + 3, 
                                             ifelse(next_agenda_item_4 == qa_topic_prop_max & !is.na(next_agenda_item_4 == qa_topic_prop_max), order + 4, order + 5)))))

# Referenda

tidy_table_referenda <- tidy_table_referenda %>% 
    mutate(doc_num_qa = ifelse(next_agenda_item_1 == qa_topic_prop_max & !is.na(next_agenda_item_1 == qa_topic_prop_max), order + 1,
                               ifelse(next_agenda_item_2 == qa_topic_prop_max & !is.na(next_agenda_item_2 == qa_topic_prop_max), order + 2, 
                                      ifelse(next_agenda_item_3 == qa_topic_prop_max & !is.na(next_agenda_item_3 == qa_topic_prop_max), order + 3, 
                                             ifelse(next_agenda_item_4 == qa_topic_prop_max & !is.na(next_agenda_item_4 == qa_topic_prop_max), order + 4, order + 5)))))

# Ageing

tidy_table_ageing <- tidy_table_ageing  %>% 
    mutate(doc_num_qa = ifelse(next_agenda_item_1 == qa_topic_prop_max & !is.na(next_agenda_item_1 == qa_topic_prop_max), order + 1,
                               ifelse(next_agenda_item_2 == qa_topic_prop_max & !is.na(next_agenda_item_2 == qa_topic_prop_max), order + 2, 
                                      ifelse(next_agenda_item_3 == qa_topic_prop_max & !is.na(next_agenda_item_3 == qa_topic_prop_max), order + 3, 
                                             ifelse(next_agenda_item_4 == qa_topic_prop_max & !is.na(next_agenda_item_4 == qa_topic_prop_max), order + 4, order + 5)))))

# Climate

tidy_table_climate <- tidy_table_climate %>% 
    mutate(doc_num_qa = ifelse(next_agenda_item_1 == qa_topic_prop_max & !is.na(next_agenda_item_1 == qa_topic_prop_max), order + 1,
                               ifelse(next_agenda_item_2 == qa_topic_prop_max & !is.na(next_agenda_item_2 == qa_topic_prop_max), order + 2, 
                                      ifelse(next_agenda_item_3 == qa_topic_prop_max & !is.na(next_agenda_item_3 == qa_topic_prop_max), order + 3, 
                                             ifelse(next_agenda_item_4 == qa_topic_prop_max & !is.na(next_agenda_item_4 == qa_topic_prop_max), order + 4, order + 5)))))



# # create lookup tables

# Create date lookup table
lookup_date <- dat_coded$date
names(lookup_date) <- dat_coded$order

# Lookup table for length of Q&A
lookup_length <- docvars(corp, "session_length")
names(lookup_length) <- docvars(corp, "order")


# Eighth Amendment

# Lookup the date
tidy_table_eighth <- tidy_table_eighth %>% 
    mutate(num_days = unname(lookup_date[(as.character(doc_num_qa))]) - date)

# Lookup the length
tidy_table_eighth <- tidy_table_eighth %>% 
    mutate(duration_qa = unname(lookup_length[(as.character(doc_num_qa))])) %>% 
    mutate(duration_expert = unname(lookup_length[(as.character(order))]))


# Referenda

# Lookup the date
tidy_table_referenda <- tidy_table_referenda %>% 
    mutate(num_days = unname(lookup_date[(as.character(doc_num_qa))]) - date)

# Lookup the length
tidy_table_referenda <- tidy_table_referenda %>% 
    mutate(duration_qa = unname(lookup_length[(as.character(doc_num_qa))])) %>% 
    mutate(duration_expert = unname(lookup_length[(as.character(order))]))


# Ageing

# Lookup the date
tidy_table_ageing <- tidy_table_ageing %>% 
    mutate(num_days = unname(lookup_date[(as.character(doc_num_qa))]) - date)

# Lookup the length
tidy_table_ageing <- tidy_table_ageing %>% 
    mutate(duration_qa = unname(lookup_length[(as.character(doc_num_qa))])) %>% 
    mutate(duration_expert = unname(lookup_length[(as.character(order))]))


# Climate

# Lookup the date
tidy_table_climate <- tidy_table_climate %>% 
    mutate(num_days = unname(lookup_date[(as.character(doc_num_qa))]) - date)

# Lookup the length
tidy_table_climate <- tidy_table_climate %>% 
    mutate(duration_qa = unname(lookup_length[(as.character(doc_num_qa))])) %>% 
    mutate(duration_expert = unname(lookup_length[(as.character(order))]))



# Export tables to CSV for later use

write.csv(tidy_table_eighth, "eighth.csv",
          fileEncoding = "UTF-8")

# Write Referenda CSV

write.csv(tidy_table_referenda, "referenda.csv",
          fileEncoding = "UTF-8")

# Write Ageing CSV

write.csv(tidy_table_ageing, "ageing.csv",
          fileEncoding = "UTF-8")

# Write Climate CSV

write.csv(tidy_table_climate, "climate.csv",
          fileEncoding = "UTF-8")


# Figure 03 ----

# Import datasets

eighth <- read.csv("eighth.csv", encoding = "UTF-8") %>% 
    mutate(num_docs = doc_num_qa - order) %>% 
    mutate(speaker = ifelse(is.na(speaker), "Other", speaker)) %>% 
    mutate(position_short = ifelse(is.na(position_short), "Other", position_short)) %>% 
    rowwise() %>% 
    mutate(qa_topic_prop_sum = sum(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE)) %>% 
    mutate(qa_topic_prop_avg = mean(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE)) %>% 
    filter(as.numeric(order) < 180) %>% 
    ungroup() %>% 
    mutate(topic_broad = "Abortion (Eight Amendment)")

referenda <- read.csv("referenda.csv", encoding = "UTF-8") %>% 
    mutate(num_docs = doc_num_qa - order) %>%
    mutate(speaker = ifelse(is.na(speaker), "Other", speaker)) %>% 
    mutate(position_short = ifelse(is.na(position_short), "Other", position_short)) %>% 
    rowwise() %>% 
    mutate(qa_topic_prop_sum = sum(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE)) %>% 
    mutate(qa_topic_prop_avg = mean(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE))%>% 
    ungroup() %>% 
    mutate(topic_broad = "Referenda")

ageing_population <- read.csv("ageing.csv", encoding = "UTF-8")%>% 
    mutate(num_docs = doc_num_qa - order) %>%
    mutate(speaker = ifelse(is.na(speaker), "Other", speaker)) %>% 
    mutate(position_short = ifelse(is.na(position_short), "Other", position_short)) %>%  
    rowwise() %>% 
    mutate(qa_topic_prop_sum = sum(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE)) %>% 
    mutate(qa_topic_prop_avg = mean(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE))%>% 
    ungroup() %>% 
    mutate(topic_broad = "Ageing Population")


climate_change <- read.csv("climate.csv", encoding = "UTF-8") %>%
    mutate(num_docs = doc_num_qa - order) %>%
    mutate(speaker = ifelse(is.na(speaker), "Other", speaker)) %>% 
    mutate(position_short = ifelse(is.na(position_short), "Other", position_short)) %>% 
    rowwise() %>% 
    mutate(qa_topic_prop_sum = sum(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE)) %>% 
    mutate(qa_topic_prop_avg = mean(c_across(next_agenda_item_1:next_agenda_item_5), na.rm = TRUE))%>% 
    ungroup() %>% 
    mutate(topic_broad = "Climate Change")

# Combine datasets

dat_da_raw <- rbind(eighth, 
                    ageing_population, 
                    referenda,
                    climate_change,
                    stringsAsFactors = FALSE)

# load data with topic labels

dat_labels <- read.csv("data_labels.csv", encoding = "UTF-8") %>% 
    mutate(topic = paste0("topic_", topic)) %>% 
    select(group = main_topic, topic_name, topic) 

head(dat_labels)

# merge clean topic labels
dat_da <- left_join(dat_da_raw, dat_labels)

dat_da <- dat_da %>% 
    mutate(group = dplyr::recode(group, "8th Amendment" = "Abortion (Eighth Amendment)"))

# get counts
dat_check <- dat_da %>% 
    select(docnum, date, group, session_type, session_length) %>% 
    unique()

dat_da <- select(dat_da, 
                 topic, topic_name, group, everything()) 

# Add additional variables: 

dat_da <- dat_da %>% 
    mutate(group_topic = paste(group, topic)) %>% 
    mutate(salience = ifelse(group == "8th Amendment", 13075, ifelse(group == "Climate Change", 1205, ifelse(group == "referenda", 213, 129)))) 

dat_da$position_short <- str_replace_all(dat_da$position_short, "Expert :", "Expert:")

# Create expert only dataset

dat_da_expert <- dat_da %>%
    filter(session_type == "Expert")

# Print summary
summary(dat_da$topic_prop)

# add additional variables

dat_da <- dat_da %>% 
    mutate(group = as.factor(group),
           session_type = as.factor(session_type),
           topic_prop_log = log(topic_prop * 100)) %>% 
    as.data.frame() %>% 
    mutate(topic_group = paste0(topic, group))


# Generate discrete buckets for topic proportions

dat_da <- dat_da %>% 
    group_by(topic_group) %>% 
    mutate(topic_prop_discrete = case_when(topic_prop > quantile(topic_prop, 2/3) ~ "High",
                                           topic_prop > quantile(topic_prop, 1/3) ~ "Medium",
                                           TRUE ~ "Low"))

# Arrange the topic proportion discrete buckets

dat_da$topic_prop_discrete <- factor(dat_da$topic_prop_discrete,
                                     levels = c("Low", 
                                                "Medium",
                                                "High"))

# Add additional descriptive variables

dat_da <- dat_da %>% 
    group_by(date, speaker, group) %>% 
    mutate(min_topic = min(topic_prop),
           max_topic = max(topic_prop),
           range_topics = max_topic - min_topic,
           sd_topics = sd(topic_prop))

# Organise session type levels

dat_da$session_type <- factor(dat_da$session_type,
                              levels = c("Other", 
                                         "Q&A",
                                         "Expert"))

# Fractional logistic regression model with maximum value

table(dat_da$session_type)

dat_da <- dat_da %>% 
    mutate(session_type = ifelse(session_type == "Expert", "Expert Input", "Other Agenda Item"))

dat_da$session_type <- forcats::fct_rev(dat_da$session_type)

table(dat_da$session_type)

dat_da_noprocedure <- dat_da %>% 
    filter(!topic_name %in% c("Voting", "Voting/Results",
                              "General Proceedings",
                              "Ballots: Results", 
                              "Ballots",
                              "Assembly Procedures",
                              "Assembly Contributions"))

glm_max_continuous <- glm(qa_topic_prop_max ~
                              topic_prop * session_type +
                              duration_expert +
                              duration_qa +
                              group +  
                              topic_group +
                              num_docs, 
                          data = dat_da, 
                          family = quasibinomial('logit'))


glm_max_continuous_noproc <- glm(qa_topic_prop_max ~
                                     topic_prop * session_type +
                                     duration_expert +
                                     duration_qa +
                                     group +  
                                     topic_group +
                                     num_docs, 
                                 data = dat_da_noprocedure, 
                                 family = quasibinomial('logit'))

glm_max_discrete <- glm(qa_topic_prop_max ~
                            topic_prop_discrete * session_type +
                            duration_expert +
                            duration_qa +
                            group + 
                            topic_group +
                            num_docs, 
                        data = dat_da, 
                        family = quasibinomial('logit'))

glm_max_discrete_noproc <- glm(qa_topic_prop_max ~
                                   topic_prop_discrete * session_type +
                                   duration_expert +
                                   duration_qa +
                                   group + 
                                   topic_group +
                                   num_docs, 
                               data = dat_da_noprocedure, 
                               family = quasibinomial('logit'))

p_cont_max <- ggpredict(glm_max_continuous, 
                        terms = c("topic_prop [all]", "session_type")) 

p_discrete_max <- ggpredict(glm_max_discrete, 
                            terms = c("topic_prop_discrete", "session_type")) 

p_small <- plot_continuous(x = p_cont_max) +
    scale_y_continuous(limits = c(0, 0.8),
                       breaks = c(seq(0, 0.8, 0.2))) +
    labs(title = "(a) Continuous Measure") +
    theme(plot.title = element_text(face = "plain")) +
    geom_rug(aes(x = topic_prop), data = dat_da,
             inherit.aes = FALSE, colour = "grey20")

p_small_discrete <- plot_discrete(x = p_discrete_max) +
    scale_y_continuous(limits = c(0, 0.8),
                       breaks = c(seq(0, 0.8, 0.2))) +
    labs(title = "(b) Categorical Measure") +
    theme(plot.title = element_text(face = "plain"))

plot_grid(p_small, p_small_discrete, nrow = 1)

ggsave("fig_03.pdf",
       width = 9.5, height = 6.5)
ggsave("fig_03.eps",
       width = 9.5, height = 6.5)


# Table 04 ----

texreg(list(glm_max_continuous, glm_max_continuous_noproc,
            glm_max_discrete, glm_max_discrete_noproc),
       omit.coef = c("(Intercept)|topic_group*|group*|duration_*|num_docs*"),
       fontsize = "footnotesize",
       include.aic = FALSE,
       include.bic = FALSE,
       include.loglik = FALSE,
       custom.model.names = c(
           "M 1 (full)",
           "M 2 (subset)",
           "M 3 (full)",
           "M 4 (subset)"
       ),
       caption.above = TRUE,
       custom.coef.map = list(
           "topic_prop" = "Topic Prevalence",
           "session_typeExpert Input" = "Expert Input (ref.: Other)",
           "topic_prop:session_typeExpert Input" = "Topic Prevalence $\\times$ Expert Input",
           "topic_prop_discreteMedium" = "Topic Prevalence (Discrete): Medium (ref.: Low)",
           "topic_prop_discreteHigh" = "Topic Prevalence (Discrete): High",
           "topic_prop_discreteMedium:session_typeExpert Input" = "Topic Prevalence (Discrete): Medium $\\times$ Expert Input",
           "topic_prop_discreteHigh:session_typeExpert Input" = "Topic Prevalence (Discrete): High $\\times$ Expert Input"
       ),
       caption = "Predicting issue emphasis in Q\\&A sessions",
       label = "tab:main_proc",
       custom.gof.rows = list("Controls" = c(rep("\\checkmark", 4)),
                              "Fixed Effects: 4 Issues" = c(rep("\\checkmark", 4)),
                              "Fixed Effects: Topics" = c(rep("\\checkmark", 4))),
       file = "tab_04.tex")

wordreg(list(glm_max_continuous, glm_max_continuous_noproc,
             glm_max_discrete, glm_max_discrete_noproc),
        omit.coef = c("(Intercept)|topic_group*|group*|duration_*|num_docs*"),
        fontsize = "footnotesize",
        include.aic = FALSE,
        include.bic = FALSE,
        include.loglik = FALSE,
        custom.model.names = c(
            "M 1 (full)",
            "M 2 (subset)",
            "M 3 (full)",
            "M 4 (subset)"
        ),
        caption.above = TRUE,
        custom.coef.map = list(
            "topic_prop" = "Topic Prevalence",
            "session_typeExpert Input" = "Expert Input",
            "topic_prop:session_typeExpert Input" = "Topic Prevalence $\\times$ Expert Input",
            "topic_prop_discreteMedium" = "Topic Prevalence (Discrete): Medium",
            "topic_prop_discreteHigh" = "Topic Prevalence (Discrete): High",
            "topic_prop_discreteMedium:session_typeExpert Input" = "Topic Prevalence (Discrete): Medium $\\times$ Expert Input",
            "topic_prop_discreteHigh:session_typeExpert Input" = "Topic Prevalence (Discrete): High $\\times$ Expert Input"
        ),
        caption = "Predicting issue emphasis in Q\\&A sessions",
        label = "tab:main_proc",
        custom.gof.rows = list("Controls" = c(rep("\\checkmark", 4)),
                               "Fixed Effects: 4 Issues" = c(rep("\\checkmark", 4)),
                               "Fixed Effects: Topics" = c(rep("\\checkmark", 4))),
        file = "tab_04.doc")

htmlreg(list(glm_max_continuous, glm_max_continuous_noproc,
             glm_max_discrete, glm_max_discrete_noproc),
        omit.coef = c("(Intercept)|topic_group*|group*|duration_*|num_docs*"),
        fontsize = "footnotesize",
        include.aic = FALSE,
        include.bic = FALSE,
        include.loglik = FALSE,
        custom.model.names = c(
            "M 1 (full)",
            "M 2 (subset)",
            "M 3 (full)",
            "M 4 (subset)"
        ),
        caption.above = TRUE,
        custom.coef.map = list(
            "topic_prop" = "Topic Prevalence",
            "session_typeExpert Input" = "Expert Input",
            "topic_prop:session_typeExpert Input" = "Topic Prevalence $\\times$ Expert Input",
            "topic_prop_discreteMedium" = "Topic Prevalence (Discrete): Medium",
            "topic_prop_discreteHigh" = "Topic Prevalence (Discrete): High",
            "topic_prop_discreteMedium:session_typeExpert Input" = "Topic Prevalence (Discrete): Medium $\\times$ Expert Input",
            "topic_prop_discreteHigh:session_typeExpert Input" = "Topic Prevalence (Discrete): High $\\times$ Expert Input"
        ),
        caption = "Predicting issue emphasis in Q\\&A sessions",
        label = "tab:main_proc",
        custom.gof.rows = list("Controls" = c(rep("\\checkmark", 4)),
                               "Fixed Effects: 4 Issues" = c(rep("\\checkmark", 4)),
                               "Fixed Effects: Topics" = c(rep("\\checkmark", 4))),
        file = "tab_04.htm")



# Table 05 ----

# load dataset for analysis
dat <- read_csv("data_analysis.csv")

# recode name of 8th amendment
dat <- dat %>% 
    mutate(group = dplyr::recode(group, "8th Amendment" = "Abortion (Eighth Amendment)"))

# get value with highest topic proportion per expert
table(dat$type)

# get the observation of the HIGHEST topic proportion for 
# each item
# then recode the professional background of each expert
dat_highest <- dat %>% 
    arrange(docnum, speaker, -topic_prop) %>% 
    group_by(docnum, speaker) %>% 
    filter(row_number()==1) %>% 
    mutate(speaker = str_replace_all(speaker, ",", ", ")) %>% # improve formatting of string
    mutate(position_expert = case_when(
        str_detect(position_short, "Industry") ~ "Expert: Industry",
        str_detect(position_short, "Academic") ~ "Expert: Academia",
        str_detect(speaker, "Emily") ~ "Expert: Academia"
    )) %>% 
    mutate(position_expert = ifelse(is.na(position_expert), "Other",
                                    position_expert)) 


# load data on gender of experts
dat_gender <- read.csv("data_experts_code.csv")

# select only doc_id and gender variable
dat_gender <- dat_gender |> 
    select(-date) |> 
    select(doc_id, gender) 


# merge with dataset containing highest topic proportion
dat_highest <- left_join(
    dat_highest, dat_gender,
    by = "doc_id"
)


# exclude "other" experts (=the moderator)
dat_highest_experts <- dat_highest %>% 
    filter(position_expert != "Other")

# get differences in topic proportions
dat_highest_experts <- dat_highest_experts %>% 
    mutate(qa_minus_expert = qa_topic_prop_max - topic_prop) |> 
    mutate(qa_minus_expert_avg = qa_topic_prop_avg - topic_prop) |> 
    mutate(expert_minus_qa = topic_prop - qa_topic_prop_max) |> 
    mutate(gender = dplyr::recode(gender,
                                  "M" = "Male",
                                  "F" = "Female")) 

# recode/relevel variables
dat_highest_experts <- dat_highest_experts |> 
    mutate(position_expert = factor(position_expert, levels = c("Expert: Industry", "Expert: Academia"))) |> 
    mutate(gender = factor(gender, levels = c("Male", "Female"))) |> 
    mutate(session_length_1000 = session_length / 1000) # divide length by 1000 words to get more interpretable scale in regression models

# overview of expert types
table(dat_highest_experts$gender,
      dat_highest_experts$position_expert)

dat_highest_experts |> 
    group_by(gender) |> 
    count()

dat_highest_experts |> 
    group_by(position_expert) |> 
    count()

# run models for gender

# base model
lm_gender_highest <- lm(qa_minus_expert ~ gender + 
                            position_expert, 
                        data = dat_highest_experts)

table(dat_highest_experts$gender, 
      dat_highest_experts$group)

# controls
lm_gender_highest_controls <- lm(qa_minus_expert ~ gender + 
                                     position_expert + 
                                     group + num_docs, 
                                 data = dat_highest_experts)


# store regression model and relabel coefficients

texreg(list(lm_gender_highest, lm_gender_highest_controls),
       fontsize = "footnotesize",
       include.aic = FALSE,
       include.bic = FALSE,
       include.loglik = FALSE,
       caption.above = TRUE,
       custom.coef.map = list(
           "(Intercept)" = "(Intercept)",
           "genderFemale" = "Gender: Female (ref.: Male)",
           "position_expertExpert: Academia" = "Background: Academia (ref.: Other)",
           "session_length_1000" = "Length of Testimonial (in 1000 Words)",
           "num_docs" = "Gap Between Testimonial and Q\\&A Session",
           "groupAgeing Population" = "Agenda Item: Ageing Population (ref.: Abortion)",
           "groupClimate Change" = "Agenda Item: Climate Change",
           "groupReferenda" = "Agenda Item: Referenda"
       ),
       caption = "Predicting the difference between the recurrence of the experts' most prevalent and the maximum prevalence of the same topic in subsequent Q\\&A sessions",
       label = "tab:gender_profession",
       file = "tab_05.tex")

# store regression model and relabel coefficients
wordreg(list(lm_gender_highest, lm_gender_highest_controls),
        fontsize = "footnotesize",
        include.aic = FALSE,
        include.bic = FALSE,
        include.loglik = FALSE,
        caption.above = TRUE,
        custom.coef.map = list(
            "(Intercept)" = "(Intercept)",
            "genderFemale" = "Gender: Female (ref.: Male)",
            "position_expertExpert: Academia" = "Background: Academia (ref.: Other)",
            "session_length_1000" = "Length of Testimonial (in 1000 Words)",
            "num_docs" = "Gap Between Testimonial and Q\\&A Session",
            "groupAgeing Population" = "Agenda Item: Ageing Population (ref.: Abortion)",
            "groupClimate Change" = "Agenda Item: Climate Change",
            "groupReferenda" = "Agenda Item: Referenda"
        ),
        caption = "Predicting the difference between the recurrance of the experts' most prevalent and the maximum prevalence of the same topic in subsequent Q\\&A sessions",
        label = "tab:gender_profession",
        file = "tab_05.doc")

htmlreg(list(lm_gender_highest, lm_gender_highest_controls),
        fontsize = "footnotesize",
        include.aic = FALSE,
        include.bic = FALSE,
        include.loglik = FALSE,
        caption.above = TRUE,
        custom.coef.map = list(
            "(Intercept)" = "(Intercept)",
            "genderFemale" = "Gender: Female (ref.: Male)",
            "position_expertExpert: Academia" = "Background: Academia (ref.: Other)",
            "session_length_1000" = "Length of Testimonial (in 1000 Words)",
            "num_docs" = "Gap Between Testimonial and Q\\&A Session",
            "groupAgeing Population" = "Agenda Item: Ageing Population (ref.: Abortion)",
            "groupClimate Change" = "Agenda Item: Climate Change",
            "groupReferenda" = "Agenda Item: Referenda"
        ),
        caption = "Predicting the difference between the recurrance of the experts' most prevalent and the maximum prevalence of the same topic in subsequent Q\\&A sessions",
        label = "tab:gender_profession",
        file = "tab_05.htm")


# Figure 04 ----

# prepare data for decriptive plots
dat_highest_experts_ci <- dat_highest_experts |> 
    group_by(gender) |> 
    summarise(mean = mean(qa_minus_expert),
              se = sd(qa_minus_expert) / sqrt(n()))

# plot distribution based on gender
p_gender <- ggplot(dat_highest_experts, aes(x = gender,
                                            y = qa_minus_expert)) +
    ggbeeswarm::geom_quasirandom(colour = "grey80",
                                 width = 0.3,
                                 size = 2.5) +
    geom_point(data = dat_highest_experts_ci,
               size = 5, shape = 15,
               aes(x  = gender, y = mean),
               colour = "black", 
               inherit.aes = FALSE) +
    geom_linerange(data = dat_highest_experts_ci,
                   size = 1.05,
                   aes(x  = gender,
                       ymin = mean - 1.96 * se, ymax = mean + 1.96 * se),
                   colour = "black", 
                   inherit.aes = FALSE) +
    coord_flip() +
    geom_text(data = dat_highest_experts_ci,
              aes(label = gender,
                  x = gender, y = mean), nudge_x = 0.25,
              size = 4.5,
              inherit.aes= FALSE) +
    scale_y_continuous(breaks = c(seq(-1, 0.2, 0.2))) +
    annotate("text", y = -1, x = 1.5, 
             colour = "grey50", 
             size = 4, 
             hjust = 0, 
             label = "More Influence") +
    geom_segment(aes(x = 1.35, y = -1, xend = 1.35, yend = -0.7),
                 colour = "grey50",
                 arrow = arrow(length = unit(0.3, "cm"))) +
    labs(x = NULL,
         title = "Gender",
         y = "") +
    theme(axis.text.y = element_blank(),
          axis.ticks.y = element_blank())


# repeat for academics vs experts from other backgrounds
dat_highest_experts <- dat_highest_experts |> 
    mutate(position_expert_dummy = ifelse(position_expert == "Expert: Industry", 
                                          "Other Background",
                                          "Academic"))

dat_highest_experts_aca_ci <- dat_highest_experts |> 
    group_by(position_expert_dummy) |> 
    summarise(mean = mean(qa_minus_expert),
              se = sd(qa_minus_expert) / sqrt(n()))

p_position <- ggplot(dat_highest_experts, aes(x = position_expert_dummy,
                                              y = qa_minus_expert)) +
    ggbeeswarm::geom_quasirandom(colour = "grey80",
                                 width = 0.3,
                                 size = 2.5) +
    geom_point(data = dat_highest_experts_aca_ci,
               size = 5, shape = 15,
               aes(x  = position_expert_dummy, y = mean),
               colour = "black", 
               inherit.aes = FALSE) +
    geom_linerange(data = dat_highest_experts_aca_ci,
                   size = 1.05,
                   aes(x  = position_expert_dummy,
                       ymin = mean - 1.96 * se, ymax = mean + 1.96 * se),
                   colour = "black", 
                   inherit.aes = FALSE) +
    coord_flip() +
    geom_text(data = dat_highest_experts_aca_ci,
              aes(label = position_expert_dummy,
                  x = position_expert_dummy, y = mean), nudge_x = 0.25,
              size = 4.5,
              inherit.aes= FALSE) +
    scale_y_continuous(breaks = c(seq(-1, 0.2, 0.2))) +
    annotate("text", y = -1, x = 1.5, 
             colour = "grey50", 
             size = 4, 
             hjust = 0, 
             label = "More Influence") +
    geom_segment(aes(x = 1.35, y = -1, xend = 1.35, yend = -0.7),
                 colour = "grey50",
                 arrow = arrow(length = unit(0.3, "cm"))) +
    labs(x = NULL,
         title = "Professional Background",
         y = "Difference between Expert's Most Prevalent Topic and Q&As' Focus on the Same Topic") +
    theme(axis.text.y = element_blank(),
          axis.ticks.y = element_blank())


# combine and save both plots
cowplot::plot_grid(p_gender, p_position, nrow = 2)
ggsave("fig_04.pdf",
       width = 9, height = 5.5)
ggsave("fig_04.eps",
       width = 9, height = 5.5)


# Table 06 ----

# load data on gender of experts
dat_gender <- read.csv("data_experts_code.csv")

dat_gender <- dat_gender |> 
    select(-date) |> 
    select(doc_id, gender) |> 
    mutate(gender = dplyr::recode(gender, "M" = "Male Expert",
                                  "F" = "Female Expert"))


# merge with dataset containing highest topic proportion
dat_extended <- left_join(
    dat_da, dat_gender,
    by = "doc_id"
)

dat_extended_experts <- dat_extended |> 
    filter(session_type == "Expert Input") |> 
    mutate(gender = factor(gender, levels = c("Male Expert",
                                              "Female Expert")))
glm_max_continuous_gender <- glm(qa_topic_prop_max ~
                                     topic_prop * gender +
                                     duration_expert +
                                     duration_qa +
                                     group +
                                     topic_group +
                                     num_docs,
                                 data = dat_extended_experts,
                                 family = quasibinomial('logit'))




# repeat for academics vs experts from other backgrounds
dat_extended_experts <- dat_extended_experts |>
    mutate(position_expert = case_when(
        str_detect(position_short, "Industry") ~ "Expert: Industry",
        str_detect(position_short, "Academic") ~ "Expert: Academia",
        str_detect(speaker, "Emily") ~ "Expert: Academia"
    )) %>%
    mutate(position_expert = ifelse(is.na(position_expert), "Other",
                                    position_expert)) %>%
    mutate(position_expert_dummy = ifelse(position_expert == "Expert: Industry",
                                          "Other Background",
                                          "Academia")) |>
    mutate(position_expert_dummy = factor(position_expert_dummy, levels = c("Other Background",
                                                                            "Academia")))

glm_max_continuous_profession <- glm(qa_topic_prop_max ~
                                         topic_prop * position_expert_dummy +
                                         duration_expert +
                                         #  gender +
                                         duration_qa +
                                         group +
                                         topic_group +
                                         num_docs,
                                     data = dat_extended_experts,
                                     family = quasibinomial('logit'))


# save as regression table

texreg(list(glm_max_continuous_gender, glm_max_continuous_profession),
       
       custom.coef.map = list(
           "topic_prop" = "Topic Prevalence",
           "genderFemale Expert" = "Female Expert (ref: Male Expert)",
           "topic_prop:genderFemale Expert" = "Topic Prevalence $\\times$ Female Expert",
           "position_expert_dummyAcademia" = "Background: Academic (ref.: Other)",
           "topic_prop:position_expert_dummyAcademia" = "Topic Prevalence $\\times$ Background: Academic"
       ),
       fontsize = "footnotesize",
       include.aic = FALSE,
       include.bic = FALSE,
       include.loglik = FALSE,
       custom.model.names = c(
           "M 1 (Gender)",
           "M 2 (Profession)"),
       caption.above = TRUE,
       caption = "Predicting issue emphasis in Q\\&A sessions conditional on the gender and professional background of experts",
       label = "tab:reg_gender_background",
       custom.gof.rows = list("Controls" = c(rep("\\checkmark", 2)),
                              "Fixed Effects: 4 Issues" = c(rep("\\checkmark", 2)),
                              "Fixed Effects: Topics" = c(rep("\\checkmark", 2))),
       file = "tab_06.tex")

wordreg(list(glm_max_continuous_gender, glm_max_continuous_profession),
        
        custom.coef.map = list(
            "topic_prop" = "Topic Prevalence",
            "genderFemale Expert" = "Female Expert (ref: Male Expert)",
            "topic_prop:genderFemale Expert" = "Topic Prevalence $\\times$ Female Expert",
            "position_expert_dummyAcademia" = "Background: Academic (ref.: Other)",
            "topic_prop:position_expert_dummyAcademia" = "Topic Prevalence $\\times$ Background: Academic"
        ),
        fontsize = "footnotesize",
        include.aic = FALSE,
        include.bic = FALSE,
        include.loglik = FALSE,
        custom.model.names = c(
            "M 1 (Gender)",
            "M 2 (Profession)"),
        caption.above = TRUE,
        caption = "Predicting issue emphasis in Q\\&A sessions conditional on the gender and professional background of experts",
        label = "tab:reg_gender_background",
        custom.gof.rows = list("Controls" = c(rep("\\checkmark", 2)),
                               "Fixed Effects: 4 Issues" = c(rep("\\checkmark", 2)),
                               "Fixed Effects: Topics" = c(rep("\\checkmark", 2))),
        file = "tab_06.doc")

htmlreg(list(glm_max_continuous_gender, glm_max_continuous_profession),
        
        custom.coef.map = list(
            "topic_prop" = "Topic Prevalence",
            "genderFemale Expert" = "Female Expert (ref: Male Expert)",
            "topic_prop:genderFemale Expert" = "Topic Prevalence $\\times$ Female Expert",
            "position_expert_dummyAcademia" = "Background: Academic (ref.: Other)",
            "topic_prop:position_expert_dummyAcademia" = "Topic Prevalence $\\times$ Background: Academic"
        ),
        fontsize = "footnotesize",
        include.aic = FALSE,
        include.bic = FALSE,
        include.loglik = FALSE,
        custom.model.names = c(
            "M 1 (Gender)",
            "M 2 (Profession)"),
        caption.above = TRUE,
        caption = "Predicting issue emphasis in Q\\&A sessions conditional on the gender and professional background of experts",
        label = "tab:reg_gender_background",
        custom.gof.rows = list("Controls" = c(rep("\\checkmark", 2)),
                               "Fixed Effects: 4 Issues" = c(rep("\\checkmark", 2)),
                               "Fixed Effects: Topics" = c(rep("\\checkmark", 2))),
        file = "tab_06.htm")


# Plots and Tables for Online Appendix ----

# Figure A1 ----

dat_delibprocess <- read.csv("data_oecd_delibprocesses.csv",
                             fileEncoding = "UTF-8")

sum(dat_delibprocess$cases)

ggplot(dat_delibprocess, aes(x = year, y = cases)) +
    geom_bar(stat = "identity", fill = "grey50") + 
    geom_text(aes(label = cases), nudge_y = 2, colour = "grey50") +
    labs(x = "Year", y = "Number of Deliberative Processes")
ggsave("fig_A1.pdf", width = 9, height = 5)


# Figure A2 ----

dat <- dat %>% 
    mutate(group = dplyr::recode(group, "8th Amendment" = "Abortion (Eighth Amendment)"))

dat_sum_date <- dat %>% 
    select(docnum, date, session_type) %>% 
    unique() %>% 
    mutate(month = lubridate::floor_date(date, unit = "month")) %>% 
    group_by(month, session_type) %>% 
    count()

min(dat$date, na.rm = TRUE)
max(dat$date, na.rm = TRUE)

dat_sum_date$session_type <- factor(dat_sum_date$session_type,
                                    levels = c("Other", "Q&A", "Expert"))

ggplot(dat_sum_date, aes(x = month, y = session_type)) + 
    geom_point(aes(size = n, shape = session_type)) +
    scale_shape_manual(values = c(15, 1, 16), guide = NULL) +
    scale_size_continuous(breaks = c(1, 3, 6, 9),
                          name = "Number of\nAgenda Items") +
    scale_x_date(date_breaks = "2 months",
                 date_labels = "%b %Y") +
    theme(legend.position = "right",
          axis.text.x = element_text(angle = 90)) +
    labs(x = "Month", y = "Agenda Item") 

ggsave("fig_A2.pdf",
       width = 9, height = 3)


# Figure A3 ----

# Generate Diagnostic Values to Pick Topic Numbers


# write function based on tutorial by Julia Silge for assessing different numbers ot topics
# Note: the function validate_stm() takes an stm object as 
# the input, and the user specifies the range of topics to 
# be validated. We set seeds to ensure reproducibility


# link to original tutorial: https://juliasilge.com/blog/evaluating-stm/

plan(multisession)

validate_stm <- function(dfmat_stm, ks) {
    
    many_models <- tibble(K = ks ) %>%
        mutate(topic_model = future_map(K, ~stm(documents = dfmat_stm$documents, 
                                                vocab = dfmat_stm$vocab, 
                                                data = dfmat_stm$meta,
                                                K = .,
                                                seed = 1254, 
                                                verbose = TRUE),
                                        .options = furrr_options(seed = TRUE)))
    
    
    heldout <- make.heldout(documents = dfmat_stm$document,
                            vocab = dfmat_stm$vocab, 
                            seed = 123)
    
    k_result <- many_models %>%
        mutate(exclusivity = map(topic_model, exclusivity),
               semantic_coherence = map(topic_model, semanticCoherence, 
                                        documents = dfmat_stm$documents),
               eval_heldout = map(topic_model, eval.heldout, heldout$missing),
               residual = map(topic_model, checkResiduals, dfmat_stm$documents),
               bound =  map_dbl(topic_model, function(x) max(x$convergence$bound)),
               lfact = map_dbl(topic_model, function(x) lfactorial(x$settings$dim$K)),
               lbound = bound + lfact,
               iterations = map_dbl(topic_model, function(x) length(x$convergence$bound)))
    
    
}

plot_validate_stm <- function(x, title, topics_selected) {
    
    p_validate <- x %>%
        transmute(K,
                  `Lower Bound` = lbound,
                  Residuals = map_dbl(residual, "dispersion"),
                  `Semantic Coherence` = map_dbl(semantic_coherence, mean),
                  `Held-out Likelihood` = map_dbl(eval_heldout, "expected.heldout")) %>%
        gather(Metric, Value, -K) %>%
        ggplot(aes(K, Value)) +
        geom_line(size = 1.05, colour = "grey50", show.legend = FALSE) +
        facet_wrap(~Metric, scales = "free_y") +
        geom_vline(xintercept =  topics_selected, linetype = "dashed", colour = "red", size = 0.8) +
        theme(strip.background = element_blank(),
              strip.text.x = element_text(size = 13),
              plot.title = element_text(size = 15, face = "bold", hjust=  0.5)) +
        labs(x = "K (Number of Topics)",
             title = title,
             y = NULL)
    print(p_validate)
}



# convert the dfm to a format used for stm
# Eighth

dfm_eighth_topics_stm_many <- quanteda::convert(dfm_eighth_temp, to = "stm")

# Referenda

# convert the dfm to a format used for stm
dfm_referenda_topics_stm_many <- quanteda::convert(dfm_referenda_topics, to = "stm")

# Ageing

dfm_ageing_topics_stm_many <- quanteda::convert(dfm_ageing_topics, to = "stm")

# Climate

dfm_climate_topics_stm_many <- quanteda::convert(dfm_climate_topics, to='stm')


# Climate change

ks_climate <- seq(4, 20, 1)

models_climate <- validate_stm(dfmat_stm = dfm_climate_topics_stm_many,
                               ks = ks_climate)

p_climate <- plot_validate_stm(models_climate, title = "Climate Change",
                               topics_selected = 9)


# Ageing population

ks_ageing <- seq(4, 20, 1)

models_ageing <- validate_stm(dfmat_stm = dfm_ageing_topics_stm_many,
                              ks = ks_ageing)

p_ageing <- plot_validate_stm(models_ageing, 
                              title = "Ageing Population",
                              topics_selected = 8)


# Abortion 

ks_abortion <- seq(4, 20, 1)

models_abortion <- validate_stm(dfmat_stm = dfm_eighth_topics_stm_many,
                                ks = ks_abortion)


p_abortion <- plot_validate_stm(models_abortion, 
                                title = "Abortion (Eighth Amendment)",
                                topics_selected = 10)


# Referenda

ks_referenda <- seq(4, 20, 1)

models_referenda <- validate_stm(dfmat_stm = dfm_referenda_topics_stm_many,
                                 ks = ks_referenda)


p_referenda <- plot_validate_stm(models_referenda, 
                                 title = "Referenda",
                                 topics_selected = 7)



# combine plots
plot_grid(p_abortion,
          p_ageing,
          p_climate,
          p_referenda, 
          nrow = 2,
          scale = 0.9)

ggsave("fig_A3.pdf",
       width = 11.5, height = 11.5)


# Table A1 ----

# necessary function to create table
addtorow <-  list()
addtorow$pos <- list()
addtorow$pos[[1]] <- c(0)
addtorow$command <- c(paste("\\hline \n",
                            "\\endhead \n",
                            "\\hline \n",
                            "\\endfoot \n",
                            "\\endlastfoot \n", sep = ""))


tstat_sum_table <- dat %>% 
    mutate(qa_dummy = ifelse(session_type == "Q&A", 1,0),
           other_dummy = ifelse(session_type == "Other", 1,0),
           expert_dummy = ifelse(session_type == "Expert", 1, 0)) %>% 
    select(group, docnum, qa_dummy, other_dummy, expert_dummy) %>% 
    unique() %>% 
    group_by(group) %>% 
    summarise(
        `Expert Inputs` = sum(expert_dummy),
        `Q&A` = sum(qa_dummy),
        `Other` = sum(other_dummy)) %>% 
    rename(Topic = group)

print(xtable(tstat_sum_table, 
             type = "latex",
             label="tab:summary_effective",
             caption = "Overview of documents in text corpus (documents used in the regression analysis)",
             digits = 0), 
      include.rownames = FALSE,
      caption.placement = "top",
      format.args=list(big.mark = ","), 
      size = "footnotesize",
      file = "tab_A1.tex")


# Qualitative Coding ----
dat_da_raw <- rbind(eighth, 
                    ageing_population, 
                    referenda,
                    climate_change,
                    stringsAsFactors = FALSE)


dat_da_select <- dat_da_raw %>% 
    select(date, title_youtube, group, session_item_id = doc_id) %>% 
    unique()



# get Q&A sessions

dat_qa <- filter(dat_da_raw, session_type == "Q&A") %>% 
    select(group, order, url_youtube = url_youtube.x, 
           doc_id, cc_available) %>% 
    unique()

# set seed and sample one video per topic

set.seed(135)

dat_qa_sample <- dat_qa %>% 
    group_by(group) %>% 
    sample_n(size = 1)

# write.csv(dat_qa_sample, "data_qa_sampled.csv")


# Table A2 ----

# load hand-coded samples
dat_qual <- read.csv("data_coding_qualitative.csv")

dat_qual <- dat_qual %>% 
    filter(type != "Other") %>% # only keep questions and answers
    mutate(response_to_question = str_replace_all(response_to_question, "Direct Response", "Direct response")) %>% 
    mutate(expert_type = str_replace_all(expert_type, "Other Expert", "Expert"))# %>% 

# join with details on videos
dat_qual <- left_join(dat_qual, dat_da_select) %>% 
    mutate(group = dplyr::recode(group, "8th Amendment"= "Abortion (Eighth Amendment)"))


table(dat_qual$type)

# > table(dat$type)
# 
# Answer Question 
# 81       50

# function to count number of words
nwords <- function(string, pseudo=F){
    ifelse( pseudo, 
            pattern <- "\\S+", 
            pattern <- "[[:alpha:]]+" 
    )
    str_count(string, pattern)
}


dat_qual$nwords <- nwords(dat_qual$text_extract)

# overview of length

check_tab <- dat_qual %>% 
    group_by(
        type, expert_type) %>% 
    summarise(mean_nchar = mean(nwords),
              median_nchar = median(nwords),
              max_nchar = max(nwords),
              min_nchar = min(nwords),
              sum_nchar = sum(nwords)) %>% 
    arrange(type)

check_tab

names(dat_qual)

dat_count_type <- dat_qual %>% 
    count(type)

dat_count_type

dat_count_responses <- dat_qual %>% 
    filter(type == "Answer") %>% 
    group_by(response_to_question, expert_type) %>% 
    count() %>% 
    ungroup() %>% 
    mutate(prop = n / sum(n)) %>% 
    mutate(response_to_question = paste0(response_to_question, 
                                         " to question"))

table(dat_qual$type)
table(dat_qual$response_to_question)

dat_count_responses_binary <- dat_qual %>% 
    filter(type == "Answer") %>% 
    mutate(direct_dummy = dplyr::recode(
        response_to_question, 
        "Very direct response" = "Direct response",
        "Only marginally direct" = "Marginally direct/no response",
        "No response" = "Marginally direct/no response"))


table(dat_count_responses_binary$direct_dummy)

table(dat_count_responses_binary$direct_dummy)

prop.table(table(dat_count_responses_binary$direct_dummy))


dat_prop_binary <- dat_count_responses_binary %>% 
    group_by(expert_type,
             direct_dummy) %>% 
    count() %>% 
    group_by(expert_type) %>% 
    mutate(prop = n / sum(n)) %>% 
    filter(expert_type == "Expert")



dat_prop_binary_session <- dat_count_responses_binary %>% 
    group_by(expert_type, group, 
             date,
             #response_to_question, 
             direct_dummy) %>% 
    count() %>% 
    group_by(group, expert_type) %>% 
    mutate(prop = n / sum(n)) %>% 
    filter(expert_type == "Expert") %>% 
    ungroup() %>% 
    mutate(prop = paste0(round(prop * 100, 1), "%", " (", n, ")")) %>% 
    select(-c(expert_type, n)) %>% 
    spread(direct_dummy, prop, fill = "0%") %>% 
    select(Issue = group, Date = date,
           `Direct response`,
           `Marginally direct/no response`)


dat_prop_binary_session_with_mod <- dat_count_responses_binary %>% 
    group_by(expert_type, direct_dummy) %>% 
    mutate(words_mean = mean(nwords),
           words_sum = sum(nwords),
           words_sd = sd(nwords)) %>% 
    group_by(expert_type, 
             direct_dummy, words_mean, words_sum, 
             words_sd) %>% 
    count() %>% 
    group_by(expert_type) %>% 
    mutate(prop = n / sum(n)) %>% 
    ungroup() %>% 
    mutate(prop = paste0(round(prop * 100, 1), "%", " (", n, ")")) %>% 
    select(Person = expert_type,
           Response = direct_dummy,
           `Percent (Freq)` = prop,
           `Words (Sum)`= words_sum,
           `Words (Mean)` = words_mean,
           `Words (SD)` = words_sd)

14132 / sum(dat_prop_binary_session_with_mod$`Words (Sum)`)



dat_prop_binary_session <- dat_prop_binary_session |> 
    mutate(Date = strftime(Date, '%d %B %Y'))

## Table 3 and Table A2 ----

# print dataframe
dat_prop_binary_session


print(xtable(dat_prop_binary_session,
             caption = "Coding results of expert responses to questions",
             label="tab:responses_session",
             align= c("p{0.05\\textwidth}", 
                      "p{0.3\\textwidth}", 
                      "p{0.25\\textwidth}", 
                      # "p{0.1\\textwidth}",
                      # "p{0.15\\textwidth}",
                      # "p{0.25\\textwidth}",
                      "p{0.15\\textwidth}",
                      "p{0.15\\textwidth}")),
      type = "latex",
      size = "footnotesize",
      format.args=list(big.mark = ","), 
      file="tab_03.tex",
      include.rownames = FALSE,
      caption.placement = "top")

print(xtable(dat_prop_binary_session),
      type = "html",
      size = "footnotesize",
      format.args=list(big.mark = ","), 
      file="tab_03.html",
      include.rownames = FALSE,
      caption.placement = "top")


# print data frame
dat_prop_binary_session_with_mod

print(xtable(dat_prop_binary_session_with_mod,
             caption = "Coding results of responses to questions, considering expert responses and responses from the moderator",
             label="tab:responses_session_exp_mod",
             digits = 1,
             big.mark = ",",
             align= c("p{0.05\\textwidth}", 
                      "p{0.15\\textwidth}", 
                      "p{0.3\\textwidth}",
                      "p{0.15\\textwidth}",
                      "p{0.08\\textwidth}",
                      "p{0.08\\textwidth}",
                      "p{0.08\\textwidth}")),
      type = "latex",
      format.args=list(big.mark = ","), 
      size = "footnotesize",
      file="tab_A2.tex",
      include.rownames = FALSE,
      caption.placement = "top")


# Table A3 ----

# Fractional logistic regression model with mean value

glm_avg_continuous <- glm(qa_topic_prop_avg ~
                              topic_prop * session_type +
                              duration_expert +
                              duration_qa +
                              group +  
                              topic_group +
                              num_docs, 
                          data = dat_da, 
                          family = quasibinomial('logit'))

glm_avg_continuous_noproc <- glm(qa_topic_prop_avg ~
                                     topic_prop * session_type +
                                     duration_expert +
                                     duration_qa +
                                     group +  
                                     topic_group +
                                     num_docs, 
                                 data = dat_da_noprocedure, 
                                 family = quasibinomial('logit'))

# regression model (average Topic Prevalence) 
# with discrete independent variable

glm_avg_discrete <- glm(qa_topic_prop_avg ~
                            topic_prop_discrete * session_type +
                            duration_expert +
                            duration_qa +
                            group +
                            topic_group +
                            num_docs, 
                        data = dat_da, 
                        family = quasibinomial('logit'))

glm_avg_discrete_noproc <- glm(qa_topic_prop_avg ~
                                   topic_prop_discrete * session_type +
                                   duration_expert +
                                   duration_qa +
                                   group +
                                   topic_group +
                                   num_docs, 
                               data = dat_da_noprocedure, 
                               family = quasibinomial('logit'))
p_discrete_avg <- ggpredict(glm_avg_discrete, 
                            terms = c("topic_prop_discrete", "session_type")) 


texreg(list(glm_avg_continuous, glm_avg_continuous_noproc,
            glm_avg_discrete, glm_avg_discrete_noproc),
       omit.coef = c("(Intercept)|topic_group*|group*|duration_*|num_docs*"),
       fontsize = "footnotesize",
       include.aic = FALSE,
       include.bic = FALSE,
       include.loglik = FALSE,
       custom.model.names = c(
           "M 1 (full)",
           "M 2 (subset)",
           "M 3 (full)",
           "M 4 (subset)"
       ),
       caption.above = TRUE,
       custom.coef.map = list(
           "topic_prop" = "Topic Prevalence",
           "session_typeExpert Input" = "Expert Input",
           "topic_prop:session_typeExpert Input" = "Topic Prevalence $\\times$ Expert Input",
           "topic_prop_discreteMedium" = "Topic Prevalence (Discrete): Medium",
           "topic_prop_discreteHigh" = "Topic Prevalence (Discrete): High",
           "topic_prop_discreteMedium:session_typeExpert Input" = "Topic Prevalence (Discrete): Medium $\\times$ Expert Input",
           "topic_prop_discreteHigh:session_typeExpert Input" = "Topic Prevalence (Discrete): High $\\times$ Expert Input"
       ),
       caption = "Predicting issue emphasis in Q\\&A sessions (using the average topic prevalences as dependent variable)",
       label = "tab:main_proc_avg",
       custom.gof.rows = list("Controls" = c(rep("\\checkmark", 4)),
                              "Fixed Effects: 4 Issues" = c(rep("\\checkmark", 4)),
                              "Fixed Effects: Topics" = c(rep("\\checkmark", 4))),
       file = "tab_A3.tex")

# Figure A4 ----

# Plot predicted values
p_cont_avg <- ggpredict(glm_avg_continuous, 
                        terms = c("topic_prop [all]", "session_type")) 



plot_continuous(x = p_cont_avg,
                ylab = "Predicted Average Topic Prevalence\nAcross Subsequent Q&A Sessions") +
    geom_rug(aes(x = topic_prop), data = dat_da,
             inherit.aes = FALSE, colour = "grey20")
ggsave("fig_A4.pdf",
       width = 9, height = 5)


# Figure A5 ----

plot_discrete(x = p_discrete_avg,
              ylab = "Predicted Average Topic Prevalence\nAcross Subsequent Q&A Sessions")

ggsave("fig_A5.pdf",
       width = 9, height = 5)


# Figure A6 ----
# jackknife type regression model

topic_group <- unique(dat_da$group)
topic_group

tidy_jacknife_main_topic <- data.frame()

for (i in topic_group) {
    
    glm_subset <- glm(qa_topic_prop_max ~
                          topic_prop * session_type +
                          duration_expert +
                          duration_qa +
                          group +  
                          topic_group +
                          num_docs, 
                      data = filter(dat_da, group != i),
                      family = quasibinomial('logit'))
    
    tidy_glm_subset <- tidy(glm_subset) %>% 
        mutate(excluded = i)
    
    tidy_jacknife_main_topic <- bind_rows(tidy_jacknife_main_topic,
                                          tidy_glm_subset)
}



names(tidy_jacknife_main_topic)
table(tidy_jacknife_main_topic$term)

tidy_jackknife <- tidy_jacknife_main_topic %>% 
    filter(term %in% c("session_typeExpert Input",
                       "topic_prop", 
                       "topic_prop:session_typeExpert Input")) %>% 
    mutate(term = dplyr::recode(term, 
                                "session_typeExpert Input" = "Expert Input",
                                "topic_prop" = "Topic Prevalence",
                                "topic_prop:session_typeExpert Input" = "Topic Prevalence x Expert Input"),
    ) %>% 
    mutate(excluded = paste0("Exclude:\n", excluded))



ggplot(tidy_jackknife, aes(x = term, y = estimate,
                           ymin = estimate - 1.96 * std.error,
                           ymax = estimate + 1.96  * std.error)) +
    geom_pointrange(position = position_dodge(width = 0.3)) +
    coord_flip() + 
    facet_wrap(~excluded) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    labs(y = "Coefficient (and 95% Confidence Interval)",
         x = NULL) +
    theme(legend.position = "right", axis.text.y = element_text(hjust = 1)) 
ggsave("fig_A6.pdf",
       width = 9, height = 3.5)



# Figure A7 ----

dat_da <- dat_da %>% 
    mutate(topic_subtopic = paste(group, topic_name, sep = ": "))

topic_group_subtopic <- unique(dat_da$topic_subtopic)

tidy_jackknife_subtopic <- data.frame()

for (i in topic_group_subtopic) {
    
    glm_subset <- glm(qa_topic_prop_max ~
                          topic_prop * session_type +
                          duration_expert +
                          duration_qa +
                          group +  
                          topic_group +
                          num_docs, 
                      data = filter(dat_da, topic_subtopic != i),
                      family = quasibinomial('logit'))
    
    
    tidy_glm_subset <- tidy(glm_subset) %>% 
        mutate(excluded = i)
    
    tidy_jackknife_subtopic <- bind_rows(tidy_jackknife_subtopic,
                                         tidy_glm_subset)
}



tidy_jackknife_subtopic <- tidy_jackknife_subtopic %>%
    filter(term %in% c("topic_prop:session_typeExpert Input")) %>% 
    mutate(term = dplyr::recode(term, 
                                "session_typeExpert Input" = "Expert Input",
                                "topic_prop" = "Topic Prevalence",
                                "topic_prop:session_typeExpert Input" = "Topic Prevalence x Expert Input"),
    ) %>% 
    arrange(term) 

nrow(tidy_jackknife_subtopic)

ggplot(tidy_jackknife_subtopic, aes(x = reorder(excluded, estimate), 
                                    y = estimate,
                                    ymin = estimate - 1.96 * std.error,
                                    ymax = estimate + 1.96  * std.error)) +
    geom_pointrange(position = position_dodge(width = 0.3)) +
    coord_flip() + 
    geom_hline(yintercept = 0, linetype = "dashed") +
    labs(y = "Coefficient for Topic Prevalence x Expert Input\n(and 95% Confidence Interval)",
         x = "Excluded Topic") +
    theme(legend.position = "right",
          axis.text.y = element_text(hjust = 1)) 
ggsave("fig_A7.pdf",
       width = 9, height = 10)



# Figure A8 ----

# run jackknife-style regression models

# get all speakers
speakers <- unique(dat_highest_experts$speaker)

# empty data frame to store coefficients
tidy_reduced_all <- data.frame()

for (i in speakers) {
    cat("Exclude", i, "\n")
    
    # remove expert i from sample
    dat_reduced <- filter(dat_highest_experts, 
                          speaker != i)
    
    
    # run model without expert i
    lm_reduced <- lm(qa_minus_expert ~ gender + 
                         position_expert + 
                         group + num_docs,
                     data = dat_reduced)
    
    tidy_reduced <- broom::tidy(lm_reduced) |> 
        mutate(exclude = i) # add information removed expert
    
    # bind with data frame
    tidy_reduced_all <- bind_rows(tidy_reduced,
                                  tidy_reduced_all)
}

# clean data frame with coefficient
tidy_reduced_all_gender <- tidy_reduced_all |> 
    filter(term == "genderFemale" | term == "position_expertExpert: Academia") |> 
    mutate(exclude = str_squish(exclude)) |> 
    mutate(exclude = str_replace_all(exclude, "’", "'")) |> 
    mutate(term = gsub("gender|position_expert", "", term)) |>
    mutate(term = dplyr::recode(term, "Female" = "Gender: Female")) |> 
    mutate(term = factor(term, levels = c("Gender: Female", "Expert: Academia")))

# create plot
ggplot(tidy_reduced_all_gender, 
       aes(
           x = fct_rev(exclude),
           y = estimate,
           ymin = estimate - 1.96 * std.error, 
           ymax = estimate + 1.96 * std.error)) +
    geom_point() +
    geom_pointrange() +
    coord_flip() +
    facet_wrap(~term) +
    labs(y = "Coefficient on Expert Influence", 
         x = "Excluded Expert") +
    geom_hline(yintercept = 0,
               colour = "black",
               size = 0.8,
               linetype = "dashed") +
    scale_y_continuous(limits = c(-0.6, 0.2)) +
    theme(axis.text.y = element_text(hjust = 1))
ggsave("fig_A8.pdf",
       width = 9, height = 13)


## script ends here -- continue with 02_Analysis_Chunks.R
